[clang] [llvm] [msan] Precommit MSan Arm NEON vst tests (PR #98247)

Thurston Dang via cfe-commits cfe-commits at lists.llvm.org
Tue Jul 16 13:00:36 PDT 2024


https://github.com/thurstond updated https://github.com/llvm/llvm-project/pull/98247

From 2d65f143fffd39b4a7b29abef3372bd1a70159e5 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Tue, 9 Jul 2024 23:42:45 +0000
Subject: [PATCH 1/5] [msan] Precommit MSan Arm NEON vst tests

These tests show that MSan does not currently handle the Arm NEON vst (or vld)
intrinsics correctly.
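
For reviewers skimming the patch, the pattern every new test follows can be
condensed as below (this restates test_vst2 from the new file; the helper
name sketch_vst2 is only for illustration): partially initialize a NEON
multi-vector struct, store it to a plain array with a vstNq intrinsic, then
dump the destination's shadow and read the array back.

    #include <arm_neon.h>
    #include <sanitizer/msan_interface.h>

    int sketch_vst2(void) {
      // Only val[1] is defined; val[0] stays uninitialized, so after the
      // interleaving store every other 16-bit lane of dst2 should be
      // poisoned if MSan propagates shadow through vst2q_s16.
      int16x8x2_t vec2;
      vec2.val[1] = vdupq_n_s16(16);

      int16_t dst2[8 * 2];
      vst2q_s16(dst2, vec2);

      // Dump the destination shadow so the current handling is visible.
      __msan_print_shadow(dst2, sizeof(dst2));

      int sum = 0;
      for (int i = 0; i < 8 * 2; i++)
        sum += dst2[i];
      return sum;
    }

As the autogenerated CHECK lines below show, the instrumentation currently
checks the full operand shadows at the llvm.aarch64.neon.st2 call (branching
to __msan_warning_noreturn if any lane is uninitialized) rather than
propagating them to the shadow of the destination array.
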
---
 .../aarch64-neon-intrinsics-msan-vst.c        |  1250 ++
 .../CodeGen/aarch64-neon-intrinsics-msan.c    | 18071 ++++++++++++++++
 2 files changed, 19321 insertions(+)
 create mode 100644 clang/test/CodeGen/aarch64-neon-intrinsics-msan-vst.c
 create mode 100644 clang/test/CodeGen/aarch64-neon-intrinsics-msan.c

diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics-msan-vst.c b/clang/test/CodeGen/aarch64-neon-intrinsics-msan-vst.c
new file mode 100644
index 0000000000000..c0cfe093a1a18
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-intrinsics-msan-vst.c
@@ -0,0 +1,1250 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN:   -S \
+// RUN:   -emit-llvm -o - %s -fsanitize=memory \
+// RUN:   | FileCheck %s
+
+// REQUIRES: aarch64-registered-target || arm-registered-target
+
+#include <arm_neon.h>
+#include <sanitizer/msan_interface.h>
+
+// CHECK-LABEL: define dso_local noundef i32 @test_vst1(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[__P0_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP2]], i8 -1, i64 2, i1 false)
+// CHECK-NEXT:    [[__RET_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[VEC1:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[DST1:%.*]] = alloca [8 x i16], align 2
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[SUM:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[VEC1]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[VEC1]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP8]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store i16 0, ptr [[TMP11]], align 2
+// CHECK-NEXT:    store i16 15, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP18]], align 2
+// CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, i16 [[_MSLD]], i32 0
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP15]], i32 0
+// CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP20]], 193514046488576
+// CHECK-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load i16, ptr [[TMP22]], align 2
+// CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD2]], i32 1
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[TMP19]], i32 1
+// CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP24:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP24]], 193514046488576
+// CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load i16, ptr [[TMP26]], align 2
+// CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[_MSLD4]], i32 2
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[TMP23]], i32 2
+// CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load i16, ptr [[TMP30]], align 2
+// CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[_MSLD6]], i32 3
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[TMP27]], i32 3
+// CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load i16, ptr [[TMP34]], align 2
+// CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[_MSLD8]], i32 4
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[TMP31]], i32 4
+// CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load i16, ptr [[TMP38]], align 2
+// CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <8 x i16> [[_MSPROP9]], i16 [[_MSLD10]], i32 5
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[TMP35]], i32 5
+// CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD12:%.*]] = load i16, ptr [[TMP42]], align 2
+// CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <8 x i16> [[_MSPROP11]], i16 [[_MSLD12]], i32 6
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[TMP39]], i32 6
+// CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP44:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP45:%.*]] = xor i64 [[TMP44]], 193514046488576
+// CHECK-NEXT:    [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr
+// CHECK-NEXT:    [[_MSLD14:%.*]] = load i16, ptr [[TMP46]], align 2
+// CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <8 x i16> [[_MSPROP13]], i16 [[_MSLD14]], i32 7
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[TMP43]], i32 7
+// CHECK-NEXT:    [[TMP47:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP48:%.*]] = xor i64 [[TMP47]], 193514046488576
+// CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSPROP15]], ptr [[TMP49]], align 16
+// CHECK-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP50:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP51:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP52:%.*]] = xor i64 [[TMP51]], 193514046488576
+// CHECK-NEXT:    [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
+// CHECK-NEXT:    [[_MSLD16:%.*]] = load <8 x i16>, ptr [[TMP53]], align 16
+// CHECK-NEXT:    [[TMP54:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP55:%.*]] = xor i64 [[TMP54]], 193514046488576
+// CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD16]], ptr [[TMP56]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP50]], ptr [[__RET_I]], align 16
+// CHECK-NEXT:    [[TMP57:%.*]] = load <8 x i16>, ptr [[__RET_I]], align 16
+// CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP59:%.*]] = xor i64 [[TMP58]], 193514046488576
+// CHECK-NEXT:    [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
+// CHECK-NEXT:    [[_MSLD17:%.*]] = load <8 x i16>, ptr [[TMP60]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP61:%.*]] = ptrtoint ptr [[VEC1]] to i64
+// CHECK-NEXT:    [[TMP62:%.*]] = xor i64 [[TMP61]], 193514046488576
+// CHECK-NEXT:    [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD17]], ptr [[TMP63]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP57]], ptr [[VEC1]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[DST1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[DST1]] to i64
+// CHECK-NEXT:    [[TMP65:%.*]] = xor i64 [[TMP64]], 193514046488576
+// CHECK-NEXT:    [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP66]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP67:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP68:%.*]] = xor i64 [[TMP67]], 193514046488576
+// CHECK-NEXT:    [[TMP69:%.*]] = inttoptr i64 [[TMP68]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP69]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP70:%.*]] = load <8 x i16>, ptr [[VEC1]], align 16
+// CHECK-NEXT:    [[TMP71:%.*]] = ptrtoint ptr [[VEC1]] to i64
+// CHECK-NEXT:    [[TMP72:%.*]] = xor i64 [[TMP71]], 193514046488576
+// CHECK-NEXT:    [[TMP73:%.*]] = inttoptr i64 [[TMP72]] to ptr
+// CHECK-NEXT:    [[_MSLD18:%.*]] = load <8 x i16>, ptr [[TMP73]], align 16
+// CHECK-NEXT:    [[TMP74:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP75:%.*]] = xor i64 [[TMP74]], 193514046488576
+// CHECK-NEXT:    [[TMP76:%.*]] = inttoptr i64 [[TMP75]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD18]], ptr [[TMP76]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP70]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [8 x i16], ptr [[DST1]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP77:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP78:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP79:%.*]] = xor i64 [[TMP78]], 193514046488576
+// CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
+// CHECK-NEXT:    [[_MSLD19:%.*]] = load <8 x i16>, ptr [[TMP80]], align 16
+// CHECK-NEXT:    [[TMP81:%.*]] = bitcast <8 x i16> [[_MSLD19]] to <16 x i8>
+// CHECK-NEXT:    [[TMP82:%.*]] = bitcast <8 x i16> [[TMP77]] to <16 x i8>
+// CHECK-NEXT:    [[TMP83:%.*]] = bitcast <16 x i8> [[TMP81]] to <8 x i16>
+// CHECK-NEXT:    [[TMP84:%.*]] = bitcast <16 x i8> [[TMP82]] to <8 x i16>
+// CHECK-NEXT:    [[TMP85:%.*]] = ptrtoint ptr [[ARRAYDECAY]] to i64
+// CHECK-NEXT:    [[TMP86:%.*]] = xor i64 [[TMP85]], 193514046488576
+// CHECK-NEXT:    [[TMP87:%.*]] = inttoptr i64 [[TMP86]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[TMP83]], ptr [[TMP87]], align 2
+// CHECK-NEXT:    store <8 x i16> [[TMP84]], ptr [[ARRAYDECAY]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [8 x i16], ptr [[DST1]], i64 0, i64 0
+// CHECK-NEXT:    call void @__msan_print_shadow(ptr noundef [[ARRAYDECAY1]], i64 noundef 16)
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP88:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP89:%.*]] = xor i64 [[TMP88]], 193514046488576
+// CHECK-NEXT:    [[TMP90:%.*]] = inttoptr i64 [[TMP89]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP90]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[TMP91:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP92:%.*]] = xor i64 [[TMP91]], 193514046488576
+// CHECK-NEXT:    [[TMP93:%.*]] = inttoptr i64 [[TMP92]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP93]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[SUM]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP94:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP95:%.*]] = xor i64 [[TMP94]], 193514046488576
+// CHECK-NEXT:    [[TMP96:%.*]] = inttoptr i64 [[TMP95]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP96]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[TMP97:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP98:%.*]] = xor i64 [[TMP97]], 193514046488576
+// CHECK-NEXT:    [[TMP99:%.*]] = inttoptr i64 [[TMP98]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP99]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP100:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP101:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP102:%.*]] = xor i64 [[TMP101]], 193514046488576
+// CHECK-NEXT:    [[TMP103:%.*]] = inttoptr i64 [[TMP102]] to ptr
+// CHECK-NEXT:    [[_MSLD20:%.*]] = load i32, ptr [[TMP103]], align 4
+// CHECK-NEXT:    [[_MSPROP21:%.*]] = or i32 [[_MSLD20]], 0
+// CHECK-NEXT:    [[TMP104:%.*]] = icmp ne i32 [[_MSPROP21]], 0
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP100]], 8
+// CHECK-NEXT:    br i1 [[TMP104]], label [[TMP105:%.*]], label [[TMP106:%.*]], !prof [[PROF2:![0-9]+]]
+// CHECK:       105:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]]
+// CHECK-NEXT:    unreachable
+// CHECK:       106:
+// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    br label [[FOR_END:%.*]]
+// CHECK:       for.body:
+// CHECK-NEXT:    [[TMP107:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP108:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP109:%.*]] = xor i64 [[TMP108]], 193514046488576
+// CHECK-NEXT:    [[TMP110:%.*]] = inttoptr i64 [[TMP109]] to ptr
+// CHECK-NEXT:    [[_MSLD22:%.*]] = load i32, ptr [[TMP110]], align 4
+// CHECK-NEXT:    [[_MSPROP23:%.*]] = sext i32 [[_MSLD22]] to i64
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP107]] to i64
+// CHECK-NEXT:    [[_MSPROP24:%.*]] = or i64 0, [[_MSPROP23]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i16], ptr [[DST1]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP24]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP111:%.*]], label [[TMP112:%.*]], !prof [[PROF2]]
+// CHECK:       111:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       112:
+// CHECK-NEXT:    [[TMP113:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+// CHECK-NEXT:    [[TMP114:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP115:%.*]] = xor i64 [[TMP114]], 193514046488576
+// CHECK-NEXT:    [[TMP116:%.*]] = inttoptr i64 [[TMP115]] to ptr
+// CHECK-NEXT:    [[_MSLD25:%.*]] = load i16, ptr [[TMP116]], align 2
+// CHECK-NEXT:    [[_MSPROP26:%.*]] = sext i16 [[_MSLD25]] to i32
+// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP113]] to i32
+// CHECK-NEXT:    [[TMP117:%.*]] = load i32, ptr [[SUM]], align 4
+// CHECK-NEXT:    [[TMP118:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP119:%.*]] = xor i64 [[TMP118]], 193514046488576
+// CHECK-NEXT:    [[TMP120:%.*]] = inttoptr i64 [[TMP119]] to ptr
+// CHECK-NEXT:    [[_MSLD27:%.*]] = load i32, ptr [[TMP120]], align 4
+// CHECK-NEXT:    [[_MSPROP28:%.*]] = or i32 [[_MSLD27]], [[_MSPROP26]]
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP117]], [[CONV]]
+// CHECK-NEXT:    [[TMP121:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP122:%.*]] = xor i64 [[TMP121]], 193514046488576
+// CHECK-NEXT:    [[TMP123:%.*]] = inttoptr i64 [[TMP122]] to ptr
+// CHECK-NEXT:    store i32 [[_MSPROP28]], ptr [[TMP123]], align 4
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[SUM]], align 4
+// CHECK-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK:       for.inc:
+// CHECK-NEXT:    [[TMP124:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP125:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP126:%.*]] = xor i64 [[TMP125]], 193514046488576
+// CHECK-NEXT:    [[TMP127:%.*]] = inttoptr i64 [[TMP126]] to ptr
+// CHECK-NEXT:    [[_MSLD29:%.*]] = load i32, ptr [[TMP127]], align 4
+// CHECK-NEXT:    [[_MSPROP30:%.*]] = or i32 [[_MSLD29]], 0
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP124]], 1
+// CHECK-NEXT:    [[TMP128:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP129:%.*]] = xor i64 [[TMP128]], 193514046488576
+// CHECK-NEXT:    [[TMP130:%.*]] = inttoptr i64 [[TMP129]] to ptr
+// CHECK-NEXT:    store i32 [[_MSPROP30]], ptr [[TMP130]], align 4
+// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK:       for.end:
+// CHECK-NEXT:    [[TMP131:%.*]] = load i32, ptr [[SUM]], align 4
+// CHECK-NEXT:    [[TMP132:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP133:%.*]] = xor i64 [[TMP132]], 193514046488576
+// CHECK-NEXT:    [[TMP134:%.*]] = inttoptr i64 [[TMP133]] to ptr
+// CHECK-NEXT:    [[_MSLD31:%.*]] = load i32, ptr [[TMP134]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[DST1]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[VEC1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[_MSCMP32:%.*]] = icmp ne i32 [[_MSLD31]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP32]], label [[TMP135:%.*]], label [[TMP136:%.*]], !prof [[PROF2]]
+// CHECK:       135:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       136:
+// CHECK-NEXT:    ret i32 [[TMP131]]
+//
+int test_vst1(void) {
+  int16x8_t vec1;
+  vec1 = vdupq_n_s16(15);
+  int16_t dst1[8*1];
+  vst1q_s16(dst1, vec1);
+
+  __msan_print_shadow(dst1, sizeof(int16_t)*8*1);
+
+  int sum = 0;
+  for (int i = 0; i < 8*1; i++)
+    sum += dst1[i];
+
+  return sum;
+}
+
+// Initialization is deliberately partial (only val[1] is set) so that the destination shadows are more interesting.
+// CHECK-LABEL: define dso_local noundef i32 @test_vst2(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[__P0_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP2]], i8 -1, i64 2, i1 false)
+// CHECK-NEXT:    [[__RET_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[VEC2:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[DST2:%.*]] = alloca [16 x i16], align 2
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT:    [[SUM:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[VEC2]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[VEC2]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP8]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store i16 0, ptr [[TMP11]], align 2
+// CHECK-NEXT:    store i16 16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP18]], align 2
+// CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, i16 [[_MSLD]], i32 0
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP15]], i32 0
+// CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP20]], 193514046488576
+// CHECK-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load i16, ptr [[TMP22]], align 2
+// CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD7]], i32 1
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[TMP19]], i32 1
+// CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP24:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP24]], 193514046488576
+// CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load i16, ptr [[TMP26]], align 2
+// CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <8 x i16> [[_MSPROP8]], i16 [[_MSLD9]], i32 2
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[TMP23]], i32 2
+// CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD11:%.*]] = load i16, ptr [[TMP30]], align 2
+// CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <8 x i16> [[_MSPROP10]], i16 [[_MSLD11]], i32 3
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[TMP27]], i32 3
+// CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    [[_MSLD13:%.*]] = load i16, ptr [[TMP34]], align 2
+// CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <8 x i16> [[_MSPROP12]], i16 [[_MSLD13]], i32 4
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[TMP31]], i32 4
+// CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    [[_MSLD15:%.*]] = load i16, ptr [[TMP38]], align 2
+// CHECK-NEXT:    [[_MSPROP16:%.*]] = insertelement <8 x i16> [[_MSPROP14]], i16 [[_MSLD15]], i32 5
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[TMP35]], i32 5
+// CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD17:%.*]] = load i16, ptr [[TMP42]], align 2
+// CHECK-NEXT:    [[_MSPROP18:%.*]] = insertelement <8 x i16> [[_MSPROP16]], i16 [[_MSLD17]], i32 6
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[TMP39]], i32 6
+// CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP44:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP45:%.*]] = xor i64 [[TMP44]], 193514046488576
+// CHECK-NEXT:    [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr
+// CHECK-NEXT:    [[_MSLD19:%.*]] = load i16, ptr [[TMP46]], align 2
+// CHECK-NEXT:    [[_MSPROP20:%.*]] = insertelement <8 x i16> [[_MSPROP18]], i16 [[_MSLD19]], i32 7
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[TMP43]], i32 7
+// CHECK-NEXT:    [[TMP47:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP48:%.*]] = xor i64 [[TMP47]], 193514046488576
+// CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSPROP20]], ptr [[TMP49]], align 16
+// CHECK-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP50:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP51:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP52:%.*]] = xor i64 [[TMP51]], 193514046488576
+// CHECK-NEXT:    [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
+// CHECK-NEXT:    [[_MSLD21:%.*]] = load <8 x i16>, ptr [[TMP53]], align 16
+// CHECK-NEXT:    [[TMP54:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP55:%.*]] = xor i64 [[TMP54]], 193514046488576
+// CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD21]], ptr [[TMP56]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP50]], ptr [[__RET_I]], align 16
+// CHECK-NEXT:    [[TMP57:%.*]] = load <8 x i16>, ptr [[__RET_I]], align 16
+// CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP59:%.*]] = xor i64 [[TMP58]], 193514046488576
+// CHECK-NEXT:    [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
+// CHECK-NEXT:    [[_MSLD22:%.*]] = load <8 x i16>, ptr [[TMP60]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[VEC2]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP61:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP62:%.*]] = xor i64 [[TMP61]], 193514046488576
+// CHECK-NEXT:    [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD22]], ptr [[TMP63]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP57]], ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[DST2]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[DST2]] to i64
+// CHECK-NEXT:    [[TMP65:%.*]] = xor i64 [[TMP64]], 193514046488576
+// CHECK-NEXT:    [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP66]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP67:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP68:%.*]] = xor i64 [[TMP67]], 193514046488576
+// CHECK-NEXT:    [[TMP69:%.*]] = inttoptr i64 [[TMP68]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP69]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP70:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[VEC2]], i64 32)
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [16 x i16], ptr [[DST2]], i64 0, i64 0
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP71:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP72:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP73:%.*]] = xor i64 [[TMP72]], 193514046488576
+// CHECK-NEXT:    [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr
+// CHECK-NEXT:    [[_MSLD23:%.*]] = load <8 x i16>, ptr [[TMP74]], align 16
+// CHECK-NEXT:    [[TMP75:%.*]] = bitcast <8 x i16> [[_MSLD23]] to <16 x i8>
+// CHECK-NEXT:    [[TMP76:%.*]] = bitcast <8 x i16> [[TMP71]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL3]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP77:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP78:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP79:%.*]] = xor i64 [[TMP78]], 193514046488576
+// CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
+// CHECK-NEXT:    [[_MSLD24:%.*]] = load <8 x i16>, ptr [[TMP80]], align 16
+// CHECK-NEXT:    [[TMP81:%.*]] = bitcast <8 x i16> [[_MSLD24]] to <16 x i8>
+// CHECK-NEXT:    [[TMP82:%.*]] = bitcast <8 x i16> [[TMP77]] to <16 x i8>
+// CHECK-NEXT:    [[TMP83:%.*]] = bitcast <16 x i8> [[TMP75]] to <8 x i16>
+// CHECK-NEXT:    [[TMP84:%.*]] = bitcast <16 x i8> [[TMP76]] to <8 x i16>
+// CHECK-NEXT:    [[TMP85:%.*]] = bitcast <16 x i8> [[TMP81]] to <8 x i16>
+// CHECK-NEXT:    [[TMP86:%.*]] = bitcast <16 x i8> [[TMP82]] to <8 x i16>
+// CHECK-NEXT:    [[TMP87:%.*]] = bitcast <8 x i16> [[TMP83]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP87]], 0
+// CHECK-NEXT:    [[TMP88:%.*]] = bitcast <8 x i16> [[TMP85]] to i128
+// CHECK-NEXT:    [[_MSCMP37:%.*]] = icmp ne i128 [[TMP88]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP37]]
+// CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP89:%.*]], label [[TMP90:%.*]], !prof [[PROF2]]
+// CHECK:       89:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       90:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP84]], <8 x i16> [[TMP86]], ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[ARRAYDECAY5:%.*]] = getelementptr inbounds [16 x i16], ptr [[DST2]], i64 0, i64 0
+// CHECK-NEXT:    call void @__msan_print_shadow(ptr noundef [[ARRAYDECAY5]], i64 noundef 32)
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP91:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP92:%.*]] = xor i64 [[TMP91]], 193514046488576
+// CHECK-NEXT:    [[TMP93:%.*]] = inttoptr i64 [[TMP92]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP93]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[TMP94:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP95:%.*]] = xor i64 [[TMP94]], 193514046488576
+// CHECK-NEXT:    [[TMP96:%.*]] = inttoptr i64 [[TMP95]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP96]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[SUM]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP97:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP98:%.*]] = xor i64 [[TMP97]], 193514046488576
+// CHECK-NEXT:    [[TMP99:%.*]] = inttoptr i64 [[TMP98]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP99]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[TMP100:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP101:%.*]] = xor i64 [[TMP100]], 193514046488576
+// CHECK-NEXT:    [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP102]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP103:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP104:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP105:%.*]] = xor i64 [[TMP104]], 193514046488576
+// CHECK-NEXT:    [[TMP106:%.*]] = inttoptr i64 [[TMP105]] to ptr
+// CHECK-NEXT:    [[_MSLD25:%.*]] = load i32, ptr [[TMP106]], align 4
+// CHECK-NEXT:    [[_MSPROP26:%.*]] = or i32 [[_MSLD25]], 0
+// CHECK-NEXT:    [[TMP107:%.*]] = icmp ne i32 [[_MSPROP26]], 0
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP103]], 16
+// CHECK-NEXT:    br i1 [[TMP107]], label [[TMP108:%.*]], label [[TMP109:%.*]], !prof [[PROF2]]
+// CHECK:       108:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       109:
+// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    br label [[FOR_END:%.*]]
+// CHECK:       for.body:
+// CHECK-NEXT:    [[TMP110:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP111:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP112:%.*]] = xor i64 [[TMP111]], 193514046488576
+// CHECK-NEXT:    [[TMP113:%.*]] = inttoptr i64 [[TMP112]] to ptr
+// CHECK-NEXT:    [[_MSLD27:%.*]] = load i32, ptr [[TMP113]], align 4
+// CHECK-NEXT:    [[_MSPROP28:%.*]] = sext i32 [[_MSLD27]] to i64
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP110]] to i64
+// CHECK-NEXT:    [[_MSPROP29:%.*]] = or i64 0, [[_MSPROP28]]
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [16 x i16], ptr [[DST2]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    [[_MSCMP38:%.*]] = icmp ne i64 [[_MSPROP29]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP38]], label [[TMP114:%.*]], label [[TMP115:%.*]], !prof [[PROF2]]
+// CHECK:       114:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       115:
+// CHECK-NEXT:    [[TMP116:%.*]] = load i16, ptr [[ARRAYIDX6]], align 2
+// CHECK-NEXT:    [[TMP117:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP118:%.*]] = xor i64 [[TMP117]], 193514046488576
+// CHECK-NEXT:    [[TMP119:%.*]] = inttoptr i64 [[TMP118]] to ptr
+// CHECK-NEXT:    [[_MSLD30:%.*]] = load i16, ptr [[TMP119]], align 2
+// CHECK-NEXT:    [[_MSPROP31:%.*]] = sext i16 [[_MSLD30]] to i32
+// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP116]] to i32
+// CHECK-NEXT:    [[TMP120:%.*]] = load i32, ptr [[SUM]], align 4
+// CHECK-NEXT:    [[TMP121:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP122:%.*]] = xor i64 [[TMP121]], 193514046488576
+// CHECK-NEXT:    [[TMP123:%.*]] = inttoptr i64 [[TMP122]] to ptr
+// CHECK-NEXT:    [[_MSLD32:%.*]] = load i32, ptr [[TMP123]], align 4
+// CHECK-NEXT:    [[_MSPROP33:%.*]] = or i32 [[_MSLD32]], [[_MSPROP31]]
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP120]], [[CONV]]
+// CHECK-NEXT:    [[TMP124:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP125:%.*]] = xor i64 [[TMP124]], 193514046488576
+// CHECK-NEXT:    [[TMP126:%.*]] = inttoptr i64 [[TMP125]] to ptr
+// CHECK-NEXT:    store i32 [[_MSPROP33]], ptr [[TMP126]], align 4
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[SUM]], align 4
+// CHECK-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK:       for.inc:
+// CHECK-NEXT:    [[TMP127:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP128:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP129:%.*]] = xor i64 [[TMP128]], 193514046488576
+// CHECK-NEXT:    [[TMP130:%.*]] = inttoptr i64 [[TMP129]] to ptr
+// CHECK-NEXT:    [[_MSLD34:%.*]] = load i32, ptr [[TMP130]], align 4
+// CHECK-NEXT:    [[_MSPROP35:%.*]] = or i32 [[_MSLD34]], 0
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP127]], 1
+// CHECK-NEXT:    [[TMP131:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP132:%.*]] = xor i64 [[TMP131]], 193514046488576
+// CHECK-NEXT:    [[TMP133:%.*]] = inttoptr i64 [[TMP132]] to ptr
+// CHECK-NEXT:    store i32 [[_MSPROP35]], ptr [[TMP133]], align 4
+// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK:       for.end:
+// CHECK-NEXT:    [[TMP134:%.*]] = load i32, ptr [[SUM]], align 4
+// CHECK-NEXT:    [[TMP135:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP136:%.*]] = xor i64 [[TMP135]], 193514046488576
+// CHECK-NEXT:    [[TMP137:%.*]] = inttoptr i64 [[TMP136]] to ptr
+// CHECK-NEXT:    [[_MSLD36:%.*]] = load i32, ptr [[TMP137]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[DST2]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[VEC2]]) #[[ATTR4]]
+// CHECK-NEXT:    [[_MSCMP39:%.*]] = icmp ne i32 [[_MSLD36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP39]], label [[TMP138:%.*]], label [[TMP139:%.*]], !prof [[PROF2]]
+// CHECK:       138:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       139:
+// CHECK-NEXT:    ret i32 [[TMP134]]
+//
+int test_vst2(void) {
+  int16x8x2_t vec2;
+  vec2.val[1] = vdupq_n_s16(16);
+  int16_t dst2[8*2];
+  vst2q_s16(dst2, vec2);
+
+  __msan_print_shadow(dst2, sizeof(int16_t)*8*2);
+
+  int sum = 0;
+  for (int i = 0; i < 8*2; i++)
+    sum += dst2[i];
+
+  return sum;
+}
+
+// CHECK-LABEL: define dso_local noundef i32 @test_vst3(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[__P0_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP2]], i8 -1, i64 2, i1 false)
+// CHECK-NEXT:    [[__RET_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[VEC3:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[DST3:%.*]] = alloca [24 x i16], align 2
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT:    [[SUM:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[VEC3]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[VEC3]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP8]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store i16 0, ptr [[TMP11]], align 2
+// CHECK-NEXT:    store i16 17, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP18]], align 2
+// CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, i16 [[_MSLD]], i32 0
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP15]], i32 0
+// CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP20]], 193514046488576
+// CHECK-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load i16, ptr [[TMP22]], align 2
+// CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD9]], i32 1
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[TMP19]], i32 1
+// CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP24:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP24]], 193514046488576
+// CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
+// CHECK-NEXT:    [[_MSLD11:%.*]] = load i16, ptr [[TMP26]], align 2
+// CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <8 x i16> [[_MSPROP10]], i16 [[_MSLD11]], i32 2
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[TMP23]], i32 2
+// CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD13:%.*]] = load i16, ptr [[TMP30]], align 2
+// CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <8 x i16> [[_MSPROP12]], i16 [[_MSLD13]], i32 3
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[TMP27]], i32 3
+// CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    [[_MSLD15:%.*]] = load i16, ptr [[TMP34]], align 2
+// CHECK-NEXT:    [[_MSPROP16:%.*]] = insertelement <8 x i16> [[_MSPROP14]], i16 [[_MSLD15]], i32 4
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[TMP31]], i32 4
+// CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    [[_MSLD17:%.*]] = load i16, ptr [[TMP38]], align 2
+// CHECK-NEXT:    [[_MSPROP18:%.*]] = insertelement <8 x i16> [[_MSPROP16]], i16 [[_MSLD17]], i32 5
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[TMP35]], i32 5
+// CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD19:%.*]] = load i16, ptr [[TMP42]], align 2
+// CHECK-NEXT:    [[_MSPROP20:%.*]] = insertelement <8 x i16> [[_MSPROP18]], i16 [[_MSLD19]], i32 6
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[TMP39]], i32 6
+// CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP44:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP45:%.*]] = xor i64 [[TMP44]], 193514046488576
+// CHECK-NEXT:    [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr
+// CHECK-NEXT:    [[_MSLD21:%.*]] = load i16, ptr [[TMP46]], align 2
+// CHECK-NEXT:    [[_MSPROP22:%.*]] = insertelement <8 x i16> [[_MSPROP20]], i16 [[_MSLD21]], i32 7
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[TMP43]], i32 7
+// CHECK-NEXT:    [[TMP47:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP48:%.*]] = xor i64 [[TMP47]], 193514046488576
+// CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSPROP22]], ptr [[TMP49]], align 16
+// CHECK-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP50:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP51:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP52:%.*]] = xor i64 [[TMP51]], 193514046488576
+// CHECK-NEXT:    [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
+// CHECK-NEXT:    [[_MSLD23:%.*]] = load <8 x i16>, ptr [[TMP53]], align 16
+// CHECK-NEXT:    [[TMP54:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP55:%.*]] = xor i64 [[TMP54]], 193514046488576
+// CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD23]], ptr [[TMP56]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP50]], ptr [[__RET_I]], align 16
+// CHECK-NEXT:    [[TMP57:%.*]] = load <8 x i16>, ptr [[__RET_I]], align 16
+// CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP59:%.*]] = xor i64 [[TMP58]], 193514046488576
+// CHECK-NEXT:    [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
+// CHECK-NEXT:    [[_MSLD24:%.*]] = load <8 x i16>, ptr [[TMP60]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[VEC3]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP61:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP62:%.*]] = xor i64 [[TMP61]], 193514046488576
+// CHECK-NEXT:    [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD24]], ptr [[TMP63]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP57]], ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[DST3]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[DST3]] to i64
+// CHECK-NEXT:    [[TMP65:%.*]] = xor i64 [[TMP64]], 193514046488576
+// CHECK-NEXT:    [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP66]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP67:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP68:%.*]] = xor i64 [[TMP67]], 193514046488576
+// CHECK-NEXT:    [[TMP69:%.*]] = inttoptr i64 [[TMP68]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP69]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP70:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[VEC3]], i64 48)
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [24 x i16], ptr [[DST3]], i64 0, i64 0
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP71:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP72:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP73:%.*]] = xor i64 [[TMP72]], 193514046488576
+// CHECK-NEXT:    [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr
+// CHECK-NEXT:    [[_MSLD25:%.*]] = load <8 x i16>, ptr [[TMP74]], align 16
+// CHECK-NEXT:    [[TMP75:%.*]] = bitcast <8 x i16> [[_MSLD25]] to <16 x i8>
+// CHECK-NEXT:    [[TMP76:%.*]] = bitcast <8 x i16> [[TMP71]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP77:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP78:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP79:%.*]] = xor i64 [[TMP78]], 193514046488576
+// CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
+// CHECK-NEXT:    [[_MSLD26:%.*]] = load <8 x i16>, ptr [[TMP80]], align 16
+// CHECK-NEXT:    [[TMP81:%.*]] = bitcast <8 x i16> [[_MSLD26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP82:%.*]] = bitcast <8 x i16> [[TMP77]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL5]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP83:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP84:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP85:%.*]] = xor i64 [[TMP84]], 193514046488576
+// CHECK-NEXT:    [[TMP86:%.*]] = inttoptr i64 [[TMP85]] to ptr
+// CHECK-NEXT:    [[_MSLD27:%.*]] = load <8 x i16>, ptr [[TMP86]], align 16
+// CHECK-NEXT:    [[TMP87:%.*]] = bitcast <8 x i16> [[_MSLD27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP88:%.*]] = bitcast <8 x i16> [[TMP83]] to <16 x i8>
+// CHECK-NEXT:    [[TMP89:%.*]] = bitcast <16 x i8> [[TMP75]] to <8 x i16>
+// CHECK-NEXT:    [[TMP90:%.*]] = bitcast <16 x i8> [[TMP76]] to <8 x i16>
+// CHECK-NEXT:    [[TMP91:%.*]] = bitcast <16 x i8> [[TMP81]] to <8 x i16>
+// CHECK-NEXT:    [[TMP92:%.*]] = bitcast <16 x i8> [[TMP82]] to <8 x i16>
+// CHECK-NEXT:    [[TMP93:%.*]] = bitcast <16 x i8> [[TMP87]] to <8 x i16>
+// CHECK-NEXT:    [[TMP94:%.*]] = bitcast <16 x i8> [[TMP88]] to <8 x i16>
+// CHECK-NEXT:    [[TMP95:%.*]] = bitcast <8 x i16> [[TMP89]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP95]], 0
+// CHECK-NEXT:    [[TMP96:%.*]] = bitcast <8 x i16> [[TMP91]] to i128
+// CHECK-NEXT:    [[_MSCMP40:%.*]] = icmp ne i128 [[TMP96]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP40]]
+// CHECK-NEXT:    [[TMP97:%.*]] = bitcast <8 x i16> [[TMP93]] to i128
+// CHECK-NEXT:    [[_MSCMP41:%.*]] = icmp ne i128 [[TMP97]], 0
+// CHECK-NEXT:    [[_MSOR42:%.*]] = or i1 [[_MSOR]], [[_MSCMP41]]
+// CHECK-NEXT:    br i1 [[_MSOR42]], label [[TMP98:%.*]], label [[TMP99:%.*]], !prof [[PROF2]]
+// CHECK:       98:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       99:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP90]], <8 x i16> [[TMP92]], <8 x i16> [[TMP94]], ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[ARRAYDECAY7:%.*]] = getelementptr inbounds [24 x i16], ptr [[DST3]], i64 0, i64 0
+// CHECK-NEXT:    call void @__msan_print_shadow(ptr noundef [[ARRAYDECAY7]], i64 noundef 48)
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP100:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP101:%.*]] = xor i64 [[TMP100]], 193514046488576
+// CHECK-NEXT:    [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP102]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[TMP103:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP104:%.*]] = xor i64 [[TMP103]], 193514046488576
+// CHECK-NEXT:    [[TMP105:%.*]] = inttoptr i64 [[TMP104]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP105]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[SUM]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP106:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP107:%.*]] = xor i64 [[TMP106]], 193514046488576
+// CHECK-NEXT:    [[TMP108:%.*]] = inttoptr i64 [[TMP107]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP108]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[TMP109:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP110:%.*]] = xor i64 [[TMP109]], 193514046488576
+// CHECK-NEXT:    [[TMP111:%.*]] = inttoptr i64 [[TMP110]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP111]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP112:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP113:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP114:%.*]] = xor i64 [[TMP113]], 193514046488576
+// CHECK-NEXT:    [[TMP115:%.*]] = inttoptr i64 [[TMP114]] to ptr
+// CHECK-NEXT:    [[_MSLD28:%.*]] = load i32, ptr [[TMP115]], align 4
+// CHECK-NEXT:    [[_MSPROP29:%.*]] = or i32 [[_MSLD28]], 0
+// CHECK-NEXT:    [[TMP116:%.*]] = icmp ne i32 [[_MSPROP29]], 0
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP112]], 24
+// CHECK-NEXT:    br i1 [[TMP116]], label [[TMP117:%.*]], label [[TMP118:%.*]], !prof [[PROF2]]
+// CHECK:       117:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       118:
+// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    br label [[FOR_END:%.*]]
+// CHECK:       for.body:
+// CHECK-NEXT:    [[TMP119:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP120:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP121:%.*]] = xor i64 [[TMP120]], 193514046488576
+// CHECK-NEXT:    [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr
+// CHECK-NEXT:    [[_MSLD30:%.*]] = load i32, ptr [[TMP122]], align 4
+// CHECK-NEXT:    [[_MSPROP31:%.*]] = sext i32 [[_MSLD30]] to i64
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP119]] to i64
+// CHECK-NEXT:    [[_MSPROP32:%.*]] = or i64 0, [[_MSPROP31]]
+// CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [24 x i16], ptr [[DST3]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    [[_MSCMP43:%.*]] = icmp ne i64 [[_MSPROP32]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP43]], label [[TMP123:%.*]], label [[TMP124:%.*]], !prof [[PROF2]]
+// CHECK:       123:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       124:
+// CHECK-NEXT:    [[TMP125:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2
+// CHECK-NEXT:    [[TMP126:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64
+// CHECK-NEXT:    [[TMP127:%.*]] = xor i64 [[TMP126]], 193514046488576
+// CHECK-NEXT:    [[TMP128:%.*]] = inttoptr i64 [[TMP127]] to ptr
+// CHECK-NEXT:    [[_MSLD33:%.*]] = load i16, ptr [[TMP128]], align 2
+// CHECK-NEXT:    [[_MSPROP34:%.*]] = sext i16 [[_MSLD33]] to i32
+// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP125]] to i32
+// CHECK-NEXT:    [[TMP129:%.*]] = load i32, ptr [[SUM]], align 4
+// CHECK-NEXT:    [[TMP130:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP131:%.*]] = xor i64 [[TMP130]], 193514046488576
+// CHECK-NEXT:    [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr
+// CHECK-NEXT:    [[_MSLD35:%.*]] = load i32, ptr [[TMP132]], align 4
+// CHECK-NEXT:    [[_MSPROP36:%.*]] = or i32 [[_MSLD35]], [[_MSPROP34]]
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP129]], [[CONV]]
+// CHECK-NEXT:    [[TMP133:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP134:%.*]] = xor i64 [[TMP133]], 193514046488576
+// CHECK-NEXT:    [[TMP135:%.*]] = inttoptr i64 [[TMP134]] to ptr
+// CHECK-NEXT:    store i32 [[_MSPROP36]], ptr [[TMP135]], align 4
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[SUM]], align 4
+// CHECK-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK:       for.inc:
+// CHECK-NEXT:    [[TMP136:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP137:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP138:%.*]] = xor i64 [[TMP137]], 193514046488576
+// CHECK-NEXT:    [[TMP139:%.*]] = inttoptr i64 [[TMP138]] to ptr
+// CHECK-NEXT:    [[_MSLD37:%.*]] = load i32, ptr [[TMP139]], align 4
+// CHECK-NEXT:    [[_MSPROP38:%.*]] = or i32 [[_MSLD37]], 0
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP136]], 1
+// CHECK-NEXT:    [[TMP140:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP141:%.*]] = xor i64 [[TMP140]], 193514046488576
+// CHECK-NEXT:    [[TMP142:%.*]] = inttoptr i64 [[TMP141]] to ptr
+// CHECK-NEXT:    store i32 [[_MSPROP38]], ptr [[TMP142]], align 4
+// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK:       for.end:
+// CHECK-NEXT:    [[TMP143:%.*]] = load i32, ptr [[SUM]], align 4
+// CHECK-NEXT:    [[TMP144:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP145:%.*]] = xor i64 [[TMP144]], 193514046488576
+// CHECK-NEXT:    [[TMP146:%.*]] = inttoptr i64 [[TMP145]] to ptr
+// CHECK-NEXT:    [[_MSLD39:%.*]] = load i32, ptr [[TMP146]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[DST3]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[VEC3]]) #[[ATTR4]]
+// CHECK-NEXT:    [[_MSCMP44:%.*]] = icmp ne i32 [[_MSLD39]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP44]], label [[TMP147:%.*]], label [[TMP148:%.*]], !prof [[PROF2]]
+// CHECK:       147:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       148:
+// CHECK-NEXT:    ret i32 [[TMP143]]
+//
+int test_vst3(void) {
+  int16x8x3_t vec3;
+  vec3.val[1] = vdupq_n_s16(17);
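+  // Only val[1] is initialized; val[0] and val[2] are left undefined on
+  // purpose, so the interleaving store below writes a mix of defined and
+  // undefined lanes into dst3.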
+  int16_t dst3[8*3];
+  vst3q_s16(dst3, vec3);
+
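+  // Dump the 48-byte shadow of dst3. After the st3 interleaving store, only
+  // every third i16 element (the lanes taken from val[1]) actually holds
+  // initialized data.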
+  __msan_print_shadow(dst3, sizeof(int16_t)*8*3);
+
+  int sum = 0;
+  for (int i = 0; i < 8*3; i++)
+    sum += dst3[i];
+
+  return sum;
+}
+
+// CHECK-LABEL: define dso_local noundef i32 @test_vst4(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[__P0_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP2]], i8 -1, i64 2, i1 false)
+// CHECK-NEXT:    [[__RET_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[VEC4:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[DST4:%.*]] = alloca [32 x i16], align 2
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT:    [[SUM:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[VEC4]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[VEC4]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP8]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store i16 0, ptr [[TMP11]], align 2
+// CHECK-NEXT:    store i16 18, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP18]], align 2
+// CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, i16 [[_MSLD]], i32 0
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP15]], i32 0
+// CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP20]], 193514046488576
+// CHECK-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+// CHECK-NEXT:    [[_MSLD11:%.*]] = load i16, ptr [[TMP22]], align 2
+// CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD11]], i32 1
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[TMP19]], i32 1
+// CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP24:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP24]], 193514046488576
+// CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
+// CHECK-NEXT:    [[_MSLD13:%.*]] = load i16, ptr [[TMP26]], align 2
+// CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <8 x i16> [[_MSPROP12]], i16 [[_MSLD13]], i32 2
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[TMP23]], i32 2
+// CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD15:%.*]] = load i16, ptr [[TMP30]], align 2
+// CHECK-NEXT:    [[_MSPROP16:%.*]] = insertelement <8 x i16> [[_MSPROP14]], i16 [[_MSLD15]], i32 3
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[TMP27]], i32 3
+// CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    [[_MSLD17:%.*]] = load i16, ptr [[TMP34]], align 2
+// CHECK-NEXT:    [[_MSPROP18:%.*]] = insertelement <8 x i16> [[_MSPROP16]], i16 [[_MSLD17]], i32 4
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[TMP31]], i32 4
+// CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    [[_MSLD19:%.*]] = load i16, ptr [[TMP38]], align 2
+// CHECK-NEXT:    [[_MSPROP20:%.*]] = insertelement <8 x i16> [[_MSPROP18]], i16 [[_MSLD19]], i32 5
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[TMP35]], i32 5
+// CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD21:%.*]] = load i16, ptr [[TMP42]], align 2
+// CHECK-NEXT:    [[_MSPROP22:%.*]] = insertelement <8 x i16> [[_MSPROP20]], i16 [[_MSLD21]], i32 6
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[TMP39]], i32 6
+// CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP44:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
+// CHECK-NEXT:    [[TMP45:%.*]] = xor i64 [[TMP44]], 193514046488576
+// CHECK-NEXT:    [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr
+// CHECK-NEXT:    [[_MSLD23:%.*]] = load i16, ptr [[TMP46]], align 2
+// CHECK-NEXT:    [[_MSPROP24:%.*]] = insertelement <8 x i16> [[_MSPROP22]], i16 [[_MSLD23]], i32 7
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[TMP43]], i32 7
+// CHECK-NEXT:    [[TMP47:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP48:%.*]] = xor i64 [[TMP47]], 193514046488576
+// CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSPROP24]], ptr [[TMP49]], align 16
+// CHECK-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP50:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP51:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
+// CHECK-NEXT:    [[TMP52:%.*]] = xor i64 [[TMP51]], 193514046488576
+// CHECK-NEXT:    [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
+// CHECK-NEXT:    [[_MSLD25:%.*]] = load <8 x i16>, ptr [[TMP53]], align 16
+// CHECK-NEXT:    [[TMP54:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP55:%.*]] = xor i64 [[TMP54]], 193514046488576
+// CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD25]], ptr [[TMP56]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP50]], ptr [[__RET_I]], align 16
+// CHECK-NEXT:    [[TMP57:%.*]] = load <8 x i16>, ptr [[__RET_I]], align 16
+// CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[__RET_I]] to i64
+// CHECK-NEXT:    [[TMP59:%.*]] = xor i64 [[TMP58]], 193514046488576
+// CHECK-NEXT:    [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
+// CHECK-NEXT:    [[_MSLD26:%.*]] = load <8 x i16>, ptr [[TMP60]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[VEC4]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP61:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP62:%.*]] = xor i64 [[TMP61]], 193514046488576
+// CHECK-NEXT:    [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD26]], ptr [[TMP63]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP57]], ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[DST4]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[DST4]] to i64
+// CHECK-NEXT:    [[TMP65:%.*]] = xor i64 [[TMP64]], 193514046488576
+// CHECK-NEXT:    [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP66]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP67:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP68:%.*]] = xor i64 [[TMP67]], 193514046488576
+// CHECK-NEXT:    [[TMP69:%.*]] = inttoptr i64 [[TMP68]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP69]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP70:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[VEC4]], i64 64)
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [32 x i16], ptr [[DST4]], i64 0, i64 0
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP71:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP72:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP73:%.*]] = xor i64 [[TMP72]], 193514046488576
+// CHECK-NEXT:    [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr
+// CHECK-NEXT:    [[_MSLD27:%.*]] = load <8 x i16>, ptr [[TMP74]], align 16
+// CHECK-NEXT:    [[TMP75:%.*]] = bitcast <8 x i16> [[_MSLD27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP76:%.*]] = bitcast <8 x i16> [[TMP71]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP77:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP78:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP79:%.*]] = xor i64 [[TMP78]], 193514046488576
+// CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
+// CHECK-NEXT:    [[_MSLD28:%.*]] = load <8 x i16>, ptr [[TMP80]], align 16
+// CHECK-NEXT:    [[TMP81:%.*]] = bitcast <8 x i16> [[_MSLD28]] to <16 x i8>
+// CHECK-NEXT:    [[TMP82:%.*]] = bitcast <8 x i16> [[TMP77]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP83:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP84:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP85:%.*]] = xor i64 [[TMP84]], 193514046488576
+// CHECK-NEXT:    [[TMP86:%.*]] = inttoptr i64 [[TMP85]] to ptr
+// CHECK-NEXT:    [[_MSLD29:%.*]] = load <8 x i16>, ptr [[TMP86]], align 16
+// CHECK-NEXT:    [[TMP87:%.*]] = bitcast <8 x i16> [[_MSLD29]] to <16 x i8>
+// CHECK-NEXT:    [[TMP88:%.*]] = bitcast <8 x i16> [[TMP83]] to <16 x i8>
+// CHECK-NEXT:    [[VAL7:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL7]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP89:%.*]] = load <8 x i16>, ptr [[ARRAYIDX8]], align 16
+// CHECK-NEXT:    [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64
+// CHECK-NEXT:    [[TMP91:%.*]] = xor i64 [[TMP90]], 193514046488576
+// CHECK-NEXT:    [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr
+// CHECK-NEXT:    [[_MSLD30:%.*]] = load <8 x i16>, ptr [[TMP92]], align 16
+// CHECK-NEXT:    [[TMP93:%.*]] = bitcast <8 x i16> [[_MSLD30]] to <16 x i8>
+// CHECK-NEXT:    [[TMP94:%.*]] = bitcast <8 x i16> [[TMP89]] to <16 x i8>
+// CHECK-NEXT:    [[TMP95:%.*]] = bitcast <16 x i8> [[TMP75]] to <8 x i16>
+// CHECK-NEXT:    [[TMP96:%.*]] = bitcast <16 x i8> [[TMP76]] to <8 x i16>
+// CHECK-NEXT:    [[TMP97:%.*]] = bitcast <16 x i8> [[TMP81]] to <8 x i16>
+// CHECK-NEXT:    [[TMP98:%.*]] = bitcast <16 x i8> [[TMP82]] to <8 x i16>
+// CHECK-NEXT:    [[TMP99:%.*]] = bitcast <16 x i8> [[TMP87]] to <8 x i16>
+// CHECK-NEXT:    [[TMP100:%.*]] = bitcast <16 x i8> [[TMP88]] to <8 x i16>
+// CHECK-NEXT:    [[TMP101:%.*]] = bitcast <16 x i8> [[TMP93]] to <8 x i16>
+// CHECK-NEXT:    [[TMP102:%.*]] = bitcast <16 x i8> [[TMP94]] to <8 x i16>
+// CHECK-NEXT:    [[TMP103:%.*]] = bitcast <8 x i16> [[TMP95]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP103]], 0
+// CHECK-NEXT:    [[TMP104:%.*]] = bitcast <8 x i16> [[TMP97]] to i128
+// CHECK-NEXT:    [[_MSCMP43:%.*]] = icmp ne i128 [[TMP104]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP43]]
+// CHECK-NEXT:    [[TMP105:%.*]] = bitcast <8 x i16> [[TMP99]] to i128
+// CHECK-NEXT:    [[_MSCMP44:%.*]] = icmp ne i128 [[TMP105]], 0
+// CHECK-NEXT:    [[_MSOR45:%.*]] = or i1 [[_MSOR]], [[_MSCMP44]]
+// CHECK-NEXT:    [[TMP106:%.*]] = bitcast <8 x i16> [[TMP101]] to i128
+// CHECK-NEXT:    [[_MSCMP46:%.*]] = icmp ne i128 [[TMP106]], 0
+// CHECK-NEXT:    [[_MSOR47:%.*]] = or i1 [[_MSOR45]], [[_MSCMP46]]
+// CHECK-NEXT:    br i1 [[_MSOR47]], label [[TMP107:%.*]], label [[TMP108:%.*]], !prof [[PROF2]]
+// CHECK:       107:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       108:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP96]], <8 x i16> [[TMP98]], <8 x i16> [[TMP100]], <8 x i16> [[TMP102]], ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[ARRAYDECAY9:%.*]] = getelementptr inbounds [32 x i16], ptr [[DST4]], i64 0, i64 0
+// CHECK-NEXT:    call void @__msan_print_shadow(ptr noundef [[ARRAYDECAY9]], i64 noundef 64)
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP109:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP110:%.*]] = xor i64 [[TMP109]], 193514046488576
+// CHECK-NEXT:    [[TMP111:%.*]] = inttoptr i64 [[TMP110]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP111]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[TMP112:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP113:%.*]] = xor i64 [[TMP112]], 193514046488576
+// CHECK-NEXT:    [[TMP114:%.*]] = inttoptr i64 [[TMP113]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP114]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[SUM]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP115:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP116:%.*]] = xor i64 [[TMP115]], 193514046488576
+// CHECK-NEXT:    [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP117]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[TMP118:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP119:%.*]] = xor i64 [[TMP118]], 193514046488576
+// CHECK-NEXT:    [[TMP120:%.*]] = inttoptr i64 [[TMP119]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP120]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP121:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP122:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP123:%.*]] = xor i64 [[TMP122]], 193514046488576
+// CHECK-NEXT:    [[TMP124:%.*]] = inttoptr i64 [[TMP123]] to ptr
+// CHECK-NEXT:    [[_MSLD31:%.*]] = load i32, ptr [[TMP124]], align 4
+// CHECK-NEXT:    [[_MSPROP32:%.*]] = or i32 [[_MSLD31]], 0
+// CHECK-NEXT:    [[TMP125:%.*]] = icmp ne i32 [[_MSPROP32]], 0
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP121]], 32
+// CHECK-NEXT:    br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF2]]
+// CHECK:       126:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       127:
+// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+// CHECK:       for.cond.cleanup:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
+// CHECK-NEXT:    br label [[FOR_END:%.*]]
+// CHECK:       for.body:
+// CHECK-NEXT:    [[TMP128:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP129:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP130:%.*]] = xor i64 [[TMP129]], 193514046488576
+// CHECK-NEXT:    [[TMP131:%.*]] = inttoptr i64 [[TMP130]] to ptr
+// CHECK-NEXT:    [[_MSLD33:%.*]] = load i32, ptr [[TMP131]], align 4
+// CHECK-NEXT:    [[_MSPROP34:%.*]] = sext i32 [[_MSLD33]] to i64
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP128]] to i64
+// CHECK-NEXT:    [[_MSPROP35:%.*]] = or i64 0, [[_MSPROP34]]
+// CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [32 x i16], ptr [[DST4]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    [[_MSCMP48:%.*]] = icmp ne i64 [[_MSPROP35]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP48]], label [[TMP132:%.*]], label [[TMP133:%.*]], !prof [[PROF2]]
+// CHECK:       132:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       133:
+// CHECK-NEXT:    [[TMP134:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+// CHECK-NEXT:    [[TMP135:%.*]] = ptrtoint ptr [[ARRAYIDX10]] to i64
+// CHECK-NEXT:    [[TMP136:%.*]] = xor i64 [[TMP135]], 193514046488576
+// CHECK-NEXT:    [[TMP137:%.*]] = inttoptr i64 [[TMP136]] to ptr
+// CHECK-NEXT:    [[_MSLD36:%.*]] = load i16, ptr [[TMP137]], align 2
+// CHECK-NEXT:    [[_MSPROP37:%.*]] = sext i16 [[_MSLD36]] to i32
+// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP134]] to i32
+// CHECK-NEXT:    [[TMP138:%.*]] = load i32, ptr [[SUM]], align 4
+// CHECK-NEXT:    [[TMP139:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP140:%.*]] = xor i64 [[TMP139]], 193514046488576
+// CHECK-NEXT:    [[TMP141:%.*]] = inttoptr i64 [[TMP140]] to ptr
+// CHECK-NEXT:    [[_MSLD38:%.*]] = load i32, ptr [[TMP141]], align 4
+// CHECK-NEXT:    [[_MSPROP39:%.*]] = or i32 [[_MSLD38]], [[_MSPROP37]]
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP138]], [[CONV]]
+// CHECK-NEXT:    [[TMP142:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP143:%.*]] = xor i64 [[TMP142]], 193514046488576
+// CHECK-NEXT:    [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr
+// CHECK-NEXT:    store i32 [[_MSPROP39]], ptr [[TMP144]], align 4
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[SUM]], align 4
+// CHECK-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK:       for.inc:
+// CHECK-NEXT:    [[TMP145:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP146:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP147:%.*]] = xor i64 [[TMP146]], 193514046488576
+// CHECK-NEXT:    [[TMP148:%.*]] = inttoptr i64 [[TMP147]] to ptr
+// CHECK-NEXT:    [[_MSLD40:%.*]] = load i32, ptr [[TMP148]], align 4
+// CHECK-NEXT:    [[_MSPROP41:%.*]] = or i32 [[_MSLD40]], 0
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP145]], 1
+// CHECK-NEXT:    [[TMP149:%.*]] = ptrtoint ptr [[I]] to i64
+// CHECK-NEXT:    [[TMP150:%.*]] = xor i64 [[TMP149]], 193514046488576
+// CHECK-NEXT:    [[TMP151:%.*]] = inttoptr i64 [[TMP150]] to ptr
+// CHECK-NEXT:    store i32 [[_MSPROP41]], ptr [[TMP151]], align 4
+// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK:       for.end:
+// CHECK-NEXT:    [[TMP152:%.*]] = load i32, ptr [[SUM]], align 4
+// CHECK-NEXT:    [[TMP153:%.*]] = ptrtoint ptr [[SUM]] to i64
+// CHECK-NEXT:    [[TMP154:%.*]] = xor i64 [[TMP153]], 193514046488576
+// CHECK-NEXT:    [[TMP155:%.*]] = inttoptr i64 [[TMP154]] to ptr
+// CHECK-NEXT:    [[_MSLD42:%.*]] = load i32, ptr [[TMP155]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[DST4]]) #[[ATTR4]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[VEC4]]) #[[ATTR4]]
+// CHECK-NEXT:    [[_MSCMP49:%.*]] = icmp ne i32 [[_MSLD42]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP49]], label [[TMP156:%.*]], label [[TMP157:%.*]], !prof [[PROF2]]
+// CHECK:       156:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       157:
+// CHECK-NEXT:    ret i32 [[TMP152]]
+//
+int test_vst4(void) {
+  int16x8x4_t vec4;
+  vec4.val[2] = vdupq_n_s16(18);
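+  // As in test_vst3, only one sub-vector (val[2]) is initialized; the other
+  // three stay undefined before the interleaving st4 store into dst4.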
+  int16_t dst4[8*4];
+  vst4q_s16(dst4, vec4);
+
+  __msan_print_shadow(dst4, sizeof(int16_t)*8*4);
+
+  int sum = 0;
+  for (int i = 0; i < 8*4; i++)
+    sum += dst4[i];
+
+  return sum;
+}
+
+// CHECK-LABEL: define dso_local noundef i32 @main(
+// CHECK-SAME: i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP2]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ARGC_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP5]], i8 -1, i64 4, i1 false)
+// CHECK-NEXT:    [[ARGV_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ARGV_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP8]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP11]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[ARGC_ADDR]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    store i32 0, ptr [[TMP14]], align 4
+// CHECK-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[ARGV_ADDR]] to i64
+// CHECK-NEXT:    [[TMP16:%.*]] = xor i64 [[TMP15]], 193514046488576
+// CHECK-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP17]], align 8
+// CHECK-NEXT:    store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call noundef i32 @test_vst1()
+// CHECK-NEXT:    [[CALL1:%.*]] = call noundef i32 @test_vst2()
+// CHECK-NEXT:    [[CALL2:%.*]] = call noundef i32 @test_vst3()
+// CHECK-NEXT:    [[CALL3:%.*]] = call noundef i32 @test_vst4()
+// CHECK-NEXT:    ret i32 0
+//
+int main (int argc, char* argv[]) {
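+    // Exercise each store width in turn; the returned sums are ignored here.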
+    test_vst1();
+    test_vst2();
+    test_vst3();
+    test_vst4();
+
+    return 0;
+}
+//.
+// CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+// CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+//.
diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics-msan.c b/clang/test/CodeGen/aarch64-neon-intrinsics-msan.c
new file mode 100644
index 0000000000000..5f042b10a0c8e
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-intrinsics-msan.c
@@ -0,0 +1,18071 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN:     -S \
+// RUN:  -flax-vector-conversions=none -emit-llvm -o - %s -fsanitize=memory \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target || arm-registered-target
+
+// Forked from aarch64-neon-intrinsics.c
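+// The RUN line enables -fsanitize=memory, so the autogenerated checks below
+// also cover the shadow loads and stores MSan emits around each intrinsic.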
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local noundef <16 x i8> @test_vld1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2:![0-9]+]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <16 x i8> [[TMP32]]
+//
+uint8x16_t test_vld1q_u8(uint8_t const *a) {
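+  // Plain 16-byte load: the checks above show the shadow of the loaded bytes
+  // being read from the corresponding shadow address and returned alongside
+  // the vector result.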
+  return vld1q_u8(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x i16> @test_vld1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i16>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i16> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x i16> [[TMP32]]
+//
+uint16x8_t test_vld1q_u16(uint16_t const *a) {
+  return vld1q_u16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x i32> @test_vld1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <4 x i32> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i32>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <4 x i32> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x i32> [[TMP32]]
+//
+uint32x4_t test_vld1q_u32(uint32_t const *a) {
+  return vld1q_u32(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x i64> @test_vld1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP12]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <2 x i64> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i64>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i64>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i64> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <2 x i64> [[TMP32]]
+//
+uint64x2_t test_vld1q_u64(uint64_t const *a) {
+  return vld1q_u64(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <16 x i8> @test_vld1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <16 x i8> [[TMP32]]
+//
+int8x16_t test_vld1q_s8(int8_t const *a) {
+  return vld1q_s8(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x i16> @test_vld1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i16>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i16> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x i16> [[TMP32]]
+//
+int16x8_t test_vld1q_s16(int16_t const *a) {
+  return vld1q_s16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x i32> @test_vld1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <4 x i32> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i32>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <4 x i32> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x i32> [[TMP32]]
+//
+int32x4_t test_vld1q_s32(int32_t const *a) {
+  return vld1q_s32(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x i64> @test_vld1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP12]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <2 x i64> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i64>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i64>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i64> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <2 x i64> [[TMP32]]
+//
+int64x2_t test_vld1q_s64(int64_t const *a) {
+  return vld1q_s64(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x half> @test_vld1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x half>, ptr [[TMP12]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x half>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x half>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i16> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x half> [[TMP32]]
+//
+float16x8_t test_vld1q_f16(float16_t const *a) {
+  return vld1q_f16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x float> @test_vld1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x float>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x float>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x float> [[TMP32]]
+//
+float32x4_t test_vld1q_f32(float32_t const *a) {
+  return vld1q_f32(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x double> @test_vld1q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <2 x double> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x double>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x double>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i64> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <2 x double> [[TMP32]]
+//
+float64x2_t test_vld1q_f64(float64_t const *a) {
+  return vld1q_f64(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <16 x i8> @test_vld1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <16 x i8> [[TMP32]]
+//
+poly8x16_t test_vld1q_p8(poly8_t const *a) {
+  return vld1q_p8(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x i16> @test_vld1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD1]], ptr [[TMP24]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP18]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD2]], ptr [[TMP31]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP25]], ptr [[TMP]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i16>, ptr [[TMP]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP35]], align 16
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i16> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x i16> [[TMP32]]
+//
+poly16x8_t test_vld1q_p16(poly16_t const *a) {
+  return vld1q_p16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
+//
+uint8x8_t test_vld1_u8(uint8_t const *a) {
+  return vld1_u8(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
+//
+uint16x4_t test_vld1_u16(uint16_t const *a) {
+  return vld1_u16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x i32> @test_vld1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[TMP12]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i32>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <2 x i32> [[TMP32]]
+//
+uint32x2_t test_vld1_u32(uint32_t const *a) {
+  return vld1_u32(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <1 x i64> @test_vld1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x i64>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x i64>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <1 x i64> [[TMP32]]
+//
+uint64x1_t test_vld1_u64(uint64_t const *a) {
+  return vld1_u64(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
+//
+int8x8_t test_vld1_s8(int8_t const *a) {
+  return vld1_s8(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
+//
+int16x4_t test_vld1_s16(int16_t const *a) {
+  return vld1_s16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x i32> @test_vld1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[TMP12]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i32>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <2 x i32> [[TMP32]]
+//
+int32x2_t test_vld1_s32(int32_t const *a) {
+  return vld1_s32(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <1 x i64> @test_vld1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x i64>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x i64>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <1 x i64> [[TMP32]]
+//
+int64x1_t test_vld1_s64(int64_t const *a) {
+  return vld1_s64(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x half> @test_vld1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x half>, ptr [[TMP12]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <4 x half> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x half>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <4 x half> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x half>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x half> [[TMP32]]
+//
+float16x4_t test_vld1_f16(float16_t const *a) {
+  return vld1_f16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x float> @test_vld1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x float>, ptr [[TMP12]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <2 x float> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <2 x float> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x float>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <2 x float> [[TMP32]]
+//
+float32x2_t test_vld1_f32(float32_t const *a) {
+  return vld1_f32(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <1 x double> @test_vld1_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x double>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x double>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x double>, ptr [[TMP12]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <1 x double> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x double>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <1 x double> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x double>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <1 x double> [[TMP32]]
+//
+float64x1_t test_vld1_f64(float64_t const *a) {
+  return vld1_f64(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
+//
+poly8x8_t test_vld1_p8(poly8_t const *a) {
+  return vld1_p8(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
+//
+poly16x4_t test_vld1_p16(poly16_t const *a) {
+  return vld1_p16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_u8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
+//
+uint8x8_t test_vld1_u8_void(void *a) {
+  return vld1_u8(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_u16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
+//
+uint16x4_t test_vld1_u16_void(void *a) {
+  return vld1_u16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x i32> @test_vld1_u32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i32>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <2 x i32> [[TMP32]]
+//
+uint32x2_t test_vld1_u32_void(void *a) {
+  return vld1_u32(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <1 x i64> @test_vld1_u64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x i64>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x i64>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x i64>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <1 x i64> [[TMP32]]
+//
+uint64x1_t test_vld1_u64_void(void *a) {
+  return vld1_u64(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_s8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
+//
+int8x8_t test_vld1_s8_void(void *a) {
+  return vld1_s8(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_s16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
+//
+int16x4_t test_vld1_s16_void(void *a) {
+  return vld1_s16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x i32> @test_vld1_s32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i32>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <2 x i32> [[TMP32]]
+//
+int32x2_t test_vld1_s32_void(void *a) {
+  return vld1_s32(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <1 x i64> @test_vld1_s64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x i64>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x i64>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x i64>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <1 x i64> [[TMP32]]
+//
+int64x1_t test_vld1_s64_void(void *a) {
+  return vld1_s64(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x half> @test_vld1_f16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x half>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <4 x half> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x half>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <4 x half> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x half>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x half> [[TMP32]]
+//
+float16x4_t test_vld1_f16_void(void *a) {
+  return vld1_f16(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x float> @test_vld1_f32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x float>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <2 x float> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <2 x float> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x float>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <2 x float> [[TMP32]]
+//
+float32x2_t test_vld1_f32_void(void *a) {
+  return vld1_f32(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <1 x double> @test_vld1_f64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x double>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x double>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x double>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <1 x double> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x double>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <1 x double> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x double>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <1 x double> [[TMP32]]
+//
+float64x1_t test_vld1_f64_void(void *a) {
+  return vld1_f64(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_p8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
+//
+poly8x8_t test_vld1_p8_void(void *a) {
+  return vld1_p8(a);
+}
+
+// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_p16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 1
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
+// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
+// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
+// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
+// CHECK:       37:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       38:
+// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
+//
+poly16x4_t test_vld1_p16_void(void *a) {
+  return vld1_p16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vld2q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X16X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <16 x i8>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT8X16X2_T]] [[TMP22]]
+//
+uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
+  return vld2q_u8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vld2q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X8X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT16X8X2_T]] [[TMP22]]
+//
+uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
+  return vld2q_u16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vld2q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X4X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i32>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT32X4X2_T]] [[TMP22]]
+//
+uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
+  return vld2q_u32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X2X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT64X2X2_T]] [[TMP22]]
+//
+uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
+  return vld2q_u64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vld2q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X16X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <16 x i8>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT8X16X2_T]] [[TMP22]]
+//
+int8x16x2_t test_vld2q_s8(int8_t const *a) {
+  return vld2q_s8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vld2q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X8X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT16X8X2_T]] [[TMP22]]
+//
+int16x8x2_t test_vld2q_s16(int16_t const *a) {
+  return vld2q_s16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vld2q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X4X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i32>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT32X4X2_T]] [[TMP22]]
+//
+int32x4x2_t test_vld2q_s32(int32_t const *a) {
+  return vld2q_s32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X2X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT64X2X2_T]] [[TMP22]]
+//
+int64x2x2_t test_vld2q_s64(int64_t const *a) {
+  return vld2q_s64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float16x8x2_t @test_vld2q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x half>, <8 x half> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP22]]
+//
+float16x8x2_t test_vld2q_f16(float16_t const *a) {
+  return vld2q_f16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vld2q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <4 x float>, <4 x float> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X4X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i32>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X4X2_T]] [[TMP22]]
+//
+float32x4x2_t test_vld2q_f32(float32_t const *a) {
+  return vld2q_f32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X2_T]] [[TMP22]]
+//
+float64x2x2_t test_vld2q_f64(float64_t const *a) {
+  return vld2q_f64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vld2q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X16X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <16 x i8>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY8X16X2_T]] [[TMP22]]
+//
+poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
+  return vld2q_p8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vld2q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X8X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY16X8X2_T]] [[TMP22]]
+//
+poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
+  return vld2q_p16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vld2_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X8X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i8>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT8X8X2_T]] [[TMP22]]
+//
+uint8x8x2_t test_vld2_u8(uint8_t const *a) {
+  return vld2_u8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vld2_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X4X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT16X4X2_T]] [[TMP22]]
+//
+uint16x4x2_t test_vld2_u16(uint16_t const *a) {
+  return vld2_u16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vld2_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X2X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i32>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT32X2X2_T]] [[TMP22]]
+//
+uint32x2x2_t test_vld2_u32(uint32_t const *a) {
+  return vld2_u32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint64x1x2_t @test_vld2_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X1X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT64X1X2_T]] [[TMP22]]
+//
+uint64x1x2_t test_vld2_u64(uint64_t const *a) {
+  return vld2_u64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vld2_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X8X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i8>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT8X8X2_T]] [[TMP22]]
+//
+int8x8x2_t test_vld2_s8(int8_t const *a) {
+  return vld2_s8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vld2_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X4X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT16X4X2_T]] [[TMP22]]
+//
+int16x4x2_t test_vld2_s16(int16_t const *a) {
+  return vld2_s16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vld2_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X2X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i32>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT32X2X2_T]] [[TMP22]]
+//
+int32x2x2_t test_vld2_s32(int32_t const *a) {
+  return vld2_s32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int64x1x2_t @test_vld2_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X1X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT64X1X2_T]] [[TMP22]]
+//
+int64x1x2_t test_vld2_s64(int64_t const *a) {
+  return vld2_s64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float16x4x2_t @test_vld2_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x half>, <4 x half> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP22]]
+//
+float16x4x2_t test_vld2_f16(float16_t const *a) {
+  return vld2_f16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vld2_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <2 x float>, <2 x float> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X2X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i32>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X2X2_T]] [[TMP22]]
+//
+float32x2x2_t test_vld2_f32(float32_t const *a) {
+  return vld2_f32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X2_T]] [[TMP22]]
+//
+float64x1x2_t test_vld2_f64(float64_t const *a) {
+  return vld2_f64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vld2_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X8X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i8>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY8X8X2_T]] [[TMP22]]
+//
+poly8x8x2_t test_vld2_p8(poly8_t const *a) {
+  return vld2_p8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vld2_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X4X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY16X4X2_T]] [[TMP22]]
+//
+poly16x4x2_t test_vld2_p16(poly16_t const *a) {
+  return vld2_p16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint8x16x3_t @test_vld3q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X16X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <16 x i8>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT8X16X3_T]] [[TMP22]]
+//
+uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
+  return vld3q_u8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint16x8x3_t @test_vld3q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X8X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT16X8X3_T]] [[TMP22]]
+//
+uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
+  return vld3q_u16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint32x4x3_t @test_vld3q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X4X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i32>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT32X4X3_T]] [[TMP22]]
+//
+uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
+  return vld3q_u32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X2X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT64X2X3_T]] [[TMP22]]
+//
+uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
+  return vld3q_u64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int8x16x3_t @test_vld3q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X16X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <16 x i8>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT8X16X3_T]] [[TMP22]]
+//
+int8x16x3_t test_vld3q_s8(int8_t const *a) {
+  return vld3q_s8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int16x8x3_t @test_vld3q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X8X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT16X8X3_T]] [[TMP22]]
+//
+int16x8x3_t test_vld3q_s16(int16_t const *a) {
+  return vld3q_s16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int32x4x3_t @test_vld3q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X4X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i32>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT32X4X3_T]] [[TMP22]]
+//
+int32x4x3_t test_vld3q_s32(int32_t const *a) {
+  return vld3q_s32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X2X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT64X2X3_T]] [[TMP22]]
+//
+int64x2x3_t test_vld3q_s64(int64_t const *a) {
+  return vld3q_s64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float16x8x3_t @test_vld3q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X8X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X3_T]] [[TMP22]]
+//
+float16x8x3_t test_vld3q_f16(float16_t const *a) {
+  return vld3q_f16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float32x4x3_t @test_vld3q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X4X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i32>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X4X3_T]] [[TMP22]]
+//
+float32x4x3_t test_vld3q_f32(float32_t const *a) {
+  return vld3q_f32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X3_T]] [[TMP22]]
+//
+float64x2x3_t test_vld3q_f64(float64_t const *a) {
+  return vld3q_f64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly8x16x3_t @test_vld3q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X16X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <16 x i8>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY8X16X3_T]] [[TMP22]]
+//
+poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
+  return vld3q_p8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly16x8x3_t @test_vld3q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X8X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY16X8X3_T]] [[TMP22]]
+//
+poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
+  return vld3q_p16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint8x8x3_t @test_vld3_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X8X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i8>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT8X8X3_T]] [[TMP22]]
+//
+uint8x8x3_t test_vld3_u8(uint8_t const *a) {
+  return vld3_u8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint16x4x3_t @test_vld3_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X4X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT16X4X3_T]] [[TMP22]]
+//
+uint16x4x3_t test_vld3_u16(uint16_t const *a) {
+  return vld3_u16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint32x2x3_t @test_vld3_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X2X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i32>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT32X2X3_T]] [[TMP22]]
+//
+uint32x2x3_t test_vld3_u32(uint32_t const *a) {
+  return vld3_u32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint64x1x3_t @test_vld3_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X1X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT64X1X3_T]] [[TMP22]]
+//
+uint64x1x3_t test_vld3_u64(uint64_t const *a) {
+  return vld3_u64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int8x8x3_t @test_vld3_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X8X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i8>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT8X8X3_T]] [[TMP22]]
+//
+int8x8x3_t test_vld3_s8(int8_t const *a) {
+  return vld3_s8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int16x4x3_t @test_vld3_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X4X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT16X4X3_T]] [[TMP22]]
+//
+int16x4x3_t test_vld3_s16(int16_t const *a) {
+  return vld3_s16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int32x2x3_t @test_vld3_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X2X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i32>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT32X2X3_T]] [[TMP22]]
+//
+int32x2x3_t test_vld3_s32(int32_t const *a) {
+  return vld3_s32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int64x1x3_t @test_vld3_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X1X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT64X1X3_T]] [[TMP22]]
+//
+int64x1x3_t test_vld3_s64(int64_t const *a) {
+  return vld3_s64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float16x4x3_t @test_vld3_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X4X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X3_T]] [[TMP22]]
+//
+float16x4x3_t test_vld3_f16(float16_t const *a) {
+  return vld3_f16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float32x2x3_t @test_vld3_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X2X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i32>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X2X3_T]] [[TMP22]]
+//
+float32x2x3_t test_vld3_f32(float32_t const *a) {
+  return vld3_f32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X3_T]] [[TMP22]]
+//
+float64x1x3_t test_vld3_f64(float64_t const *a) {
+  return vld3_f64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly8x8x3_t @test_vld3_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X8X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i8>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY8X8X3_T]] [[TMP22]]
+//
+poly8x8x3_t test_vld3_p8(poly8_t const *a) {
+  return vld3_p8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly16x4x3_t @test_vld3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X4X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY16X4X3_T]] [[TMP22]]
+//
+poly16x4x3_t test_vld3_p16(poly16_t const *a) {
+  return vld3_p16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint8x16x4_t @test_vld4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X16X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <16 x i8>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT8X16X4_T]] [[TMP22]]
+//
+uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
+  return vld4q_u8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint16x8x4_t @test_vld4q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X8X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT16X8X4_T]] [[TMP22]]
+//
+uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
+  return vld4q_u16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint32x4x4_t @test_vld4q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X4X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i32>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT32X4X4_T]] [[TMP22]]
+//
+uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
+  return vld4q_u32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X2X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT64X2X4_T]] [[TMP22]]
+//
+uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
+  return vld4q_u64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int8x16x4_t @test_vld4q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X16X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <16 x i8>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT8X16X4_T]] [[TMP22]]
+//
+int8x16x4_t test_vld4q_s8(int8_t const *a) {
+  return vld4q_s8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int16x8x4_t @test_vld4q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X8X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT16X8X4_T]] [[TMP22]]
+//
+int16x8x4_t test_vld4q_s16(int16_t const *a) {
+  return vld4q_s16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int32x4x4_t @test_vld4q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X4X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i32>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT32X4X4_T]] [[TMP22]]
+//
+int32x4x4_t test_vld4q_s32(int32_t const *a) {
+  return vld4q_s32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X2X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT64X2X4_T]] [[TMP22]]
+//
+int64x2x4_t test_vld4q_s64(int64_t const *a) {
+  return vld4q_s64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float16x8x4_t @test_vld4q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X8X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X4_T]] [[TMP22]]
+//
+float16x8x4_t test_vld4q_f16(float16_t const *a) {
+  return vld4q_f16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float32x4x4_t @test_vld4q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X4X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i32>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X4X4_T]] [[TMP22]]
+//
+float32x4x4_t test_vld4q_f32(float32_t const *a) {
+  return vld4q_f32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X4_T]] [[TMP22]]
+//
+float64x2x4_t test_vld4q_f64(float64_t const *a) {
+  return vld4q_f64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly8x16x4_t @test_vld4q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X16X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <16 x i8>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY8X16X4_T]] [[TMP22]]
+//
+poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
+  return vld4q_p8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly16x8x4_t @test_vld4q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X8X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i16>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY16X8X4_T]] [[TMP22]]
+//
+poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
+  return vld4q_p16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint8x8x4_t @test_vld4_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X8X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i8>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT8X8X4_T]] [[TMP22]]
+//
+uint8x8x4_t test_vld4_u8(uint8_t const *a) {
+  return vld4_u8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint16x4x4_t @test_vld4_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X4X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT16X4X4_T]] [[TMP22]]
+//
+uint16x4x4_t test_vld4_u16(uint16_t const *a) {
+  return vld4_u16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint32x2x4_t @test_vld4_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X2X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i32>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT32X2X4_T]] [[TMP22]]
+//
+uint32x2x4_t test_vld4_u32(uint32_t const *a) {
+  return vld4_u32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.uint64x1x4_t @test_vld4_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X1X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT64X1X4_T]] [[TMP22]]
+//
+uint64x1x4_t test_vld4_u64(uint64_t const *a) {
+  return vld4_u64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int8x8x4_t @test_vld4_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X8X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i8>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT8X8X4_T]] [[TMP22]]
+//
+int8x8x4_t test_vld4_s8(int8_t const *a) {
+  return vld4_s8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int16x4x4_t @test_vld4_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X4X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT16X4X4_T]] [[TMP22]]
+//
+int16x4x4_t test_vld4_s16(int16_t const *a) {
+  return vld4_s16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int32x2x4_t @test_vld4_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X2X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i32>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT32X2X4_T]] [[TMP22]]
+//
+int32x2x4_t test_vld4_s32(int32_t const *a) {
+  return vld4_s32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.int64x1x4_t @test_vld4_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X1X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_INT64X1X4_T]] [[TMP22]]
+//
+int64x1x4_t test_vld4_s64(int64_t const *a) {
+  return vld4_s64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float16x4x4_t @test_vld4_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X4X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X4_T]] [[TMP22]]
+//
+float16x4x4_t test_vld4_f16(float16_t const *a) {
+  return vld4_f16(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float32x2x4_t @test_vld4_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X2X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i32>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X2X4_T]] [[TMP22]]
+//
+float32x2x4_t test_vld4_f32(float32_t const *a) {
+  return vld4_f32(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X4_T]] [[TMP22]]
+//
+float64x1x4_t test_vld4_f64(float64_t const *a) {
+  return vld4_f64(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly8x8x4_t @test_vld4_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X8X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i8>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY8X8X4_T]] [[TMP22]]
+//
+poly8x8x4_t test_vld4_p8(poly8_t const *a) {
+  return vld4_p8(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly16x4x4_t @test_vld4_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X4X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i16>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY16X4X4_T]] [[TMP22]]
+//
+poly16x4x4_t test_vld4_p16(poly16_t const *a) {
+  return vld4_p16(a);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <16 x i8> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <16 x i8>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
+// CHECK:       30:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       31:
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
+// CHECK-NEXT:    store <16 x i8> [[TMP26]], ptr [[TMP22]], align 1
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u8(uint8_t *a, uint8x16_t b) {
+  vst1q_u8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <8 x i16> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i16>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i16> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <8 x i16>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[TMP32]], ptr [[TMP38]], align 2
+// CHECK-NEXT:    store <8 x i16> [[TMP33]], ptr [[TMP22]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u16(uint16_t *a, uint16x8_t b) {
+  vst1q_u16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <4 x i32> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i32>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <4 x i32> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <4 x i32>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP38]], align 4
+// CHECK-NEXT:    store <4 x i32> [[TMP33]], ptr [[TMP22]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u32(uint32_t *a, uint32x4_t b) {
+  vst1q_u32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <2 x i64> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i64> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <2 x i64>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[TMP32]], ptr [[TMP38]], align 8
+// CHECK-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP22]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_u64(uint64_t *a, uint64x2_t b) {
+  vst1q_u64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <16 x i8> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <16 x i8>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
+// CHECK:       30:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       31:
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
+// CHECK-NEXT:    store <16 x i8> [[TMP26]], ptr [[TMP22]], align 1
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s8(int8_t *a, int8x16_t b) {
+  vst1q_s8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <8 x i16> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i16>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i16> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <8 x i16>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[TMP32]], ptr [[TMP38]], align 2
+// CHECK-NEXT:    store <8 x i16> [[TMP33]], ptr [[TMP22]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s16(int16_t *a, int16x8_t b) {
+  vst1q_s16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <4 x i32> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i32>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <4 x i32> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <4 x i32>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP38]], align 4
+// CHECK-NEXT:    store <4 x i32> [[TMP33]], ptr [[TMP22]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s32(int32_t *a, int32x4_t b) {
+  vst1q_s32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <2 x i64> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i64> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <2 x i64>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[TMP32]], ptr [[TMP38]], align 8
+// CHECK-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP22]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_s64(int64_t *a, int64x2_t b) {
+  vst1q_s64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i16> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x half> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <8 x i16>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x half>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[TMP32]], ptr [[TMP38]], align 2
+// CHECK-NEXT:    store <8 x half> [[TMP33]], ptr [[TMP22]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f16(float16_t *a, float16x8_t b) {
+  vst1q_f16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x float> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <4 x i32>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x float>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP38]], align 4
+// CHECK-NEXT:    store <4 x float> [[TMP33]], ptr [[TMP22]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f32(float32_t *a, float32x4_t b) {
+  vst1q_f32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <2 x double> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i64> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x double> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <2 x i64>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x double>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <2 x i64> [[TMP32]], ptr [[TMP38]], align 8
+// CHECK-NEXT:    store <2 x double> [[TMP33]], ptr [[TMP22]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f64(float64_t *a, float64x2_t b) {
+  vst1q_f64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <16 x i8> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <16 x i8>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
+// CHECK:       30:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       31:
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
+// CHECK-NEXT:    store <16 x i8> [[TMP26]], ptr [[TMP22]], align 1
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p8(poly8_t *a, poly8x16_t b) {
+  vst1q_p8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP11]], align 16
+// CHECK-NEXT:    store <8 x i16> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i16>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP18]], align 16
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[_MSLD]], ptr [[TMP21]], align 16
+// CHECK-NEXT:    store <8 x i16> [[TMP15]], ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP29]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i16> [[_MSLD2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[TMP26]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <8 x i16>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <8 x i16> [[TMP32]], ptr [[TMP38]], align 2
+// CHECK-NEXT:    store <8 x i16> [[TMP33]], ptr [[TMP22]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p16(poly16_t *a, poly16x8_t b) {
+  vst1q_p16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <8 x i8> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
+// CHECK:       30:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       31:
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
+// CHECK-NEXT:    store <8 x i8> [[TMP26]], ptr [[TMP22]], align 1
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_u8(uint8_t *a, uint8x8_t b) {
+  vst1_u8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <4 x i16> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <4 x i16>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[TMP32]], ptr [[TMP38]], align 2
+// CHECK-NEXT:    store <4 x i16> [[TMP33]], ptr [[TMP22]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_u16(uint16_t *a, uint16x4_t b) {
+  vst1_u16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <2 x i32> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i32> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <2 x i32>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[TMP32]], ptr [[TMP38]], align 4
+// CHECK-NEXT:    store <2 x i32> [[TMP33]], ptr [[TMP22]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_u32(uint32_t *a, uint32x2_t b) {
+  vst1_u32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <1 x i64> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <1 x i64> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <1 x i64> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <1 x i64>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[TMP32]], ptr [[TMP38]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP33]], ptr [[TMP22]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_u64(uint64_t *a, uint64x1_t b) {
+  vst1_u64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <8 x i8> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
+// CHECK:       30:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       31:
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
+// CHECK-NEXT:    store <8 x i8> [[TMP26]], ptr [[TMP22]], align 1
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_s8(int8_t *a, int8x8_t b) {
+  vst1_s8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <4 x i16> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <4 x i16>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[TMP32]], ptr [[TMP38]], align 2
+// CHECK-NEXT:    store <4 x i16> [[TMP33]], ptr [[TMP22]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_s16(int16_t *a, int16x4_t b) {
+  vst1_s16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <2 x i32> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <2 x i32> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i32> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <2 x i32>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[TMP32]], ptr [[TMP38]], align 4
+// CHECK-NEXT:    store <2 x i32> [[TMP33]], ptr [[TMP22]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_s32(int32_t *a, int32x2_t b) {
+  vst1_s32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <1 x i64> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <1 x i64> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <1 x i64> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <1 x i64>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[TMP32]], ptr [[TMP38]], align 8
+// CHECK-NEXT:    store <1 x i64> [[TMP33]], ptr [[TMP22]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_s64(int64_t *a, int64x1_t b) {
+  vst1_s64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <4 x half> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x half>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <4 x half> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x half> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <4 x i16>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x half>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[TMP32]], ptr [[TMP38]], align 2
+// CHECK-NEXT:    store <4 x half> [[TMP33]], ptr [[TMP22]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_f16(float16_t *a, float16x4_t b) {
+  vst1_f16(a, b);
+}
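+
+// Note (illustrative, not autogenerated): in the floating-point variants
+// (f16/f32/f64) the shadow is carried in the matching integer vector type,
+// e.g. <4 x i16> for <4 x half> above, which is why the shadow and the data
+// are bitcast separately before the final pair of stores.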
+
+// CHECK-LABEL: define dso_local void @test_vst1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <2 x float> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x float>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i32> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x float> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <2 x i32>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x float>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <2 x i32> [[TMP32]], ptr [[TMP38]], align 4
+// CHECK-NEXT:    store <2 x float> [[TMP33]], ptr [[TMP22]], align 4
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_f32(float32_t *a, float32x2_t b) {
+  vst1_f32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <1 x double>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <1 x double>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <1 x i64> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <1 x double> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <1 x double>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <1 x double> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <1 x i64> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x double> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <1 x i64>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x double>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <1 x i64> [[TMP32]], ptr [[TMP38]], align 8
+// CHECK-NEXT:    store <1 x double> [[TMP33]], ptr [[TMP22]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_f64(float64_t *a, float64x1_t b) {
+  vst1_f64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <8 x i8> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
+// CHECK:       30:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       31:
+// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
+// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
+// CHECK-NEXT:    store <8 x i8> [[TMP26]], ptr [[TMP22]], align 1
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_p8(poly8_t *a, poly8x8_t b) {
+  vst1_p8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP11]], align 8
+// CHECK-NEXT:    store <4 x i16> [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
+// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
+// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
+// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[_MSLD]], ptr [[TMP21]], align 8
+// CHECK-NEXT:    store <4 x i16> [[TMP15]], ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
+// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP29]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[_MSLD2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[TMP26]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <4 x i16>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
+// CHECK:       34:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       35:
+// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
+// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
+// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
+// CHECK-NEXT:    store <4 x i16> [[TMP32]], ptr [[TMP38]], align 2
+// CHECK-NEXT:    store <4 x i16> [[TMP33]], ptr [[TMP22]], align 2
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_p16(poly16_t *a, poly16x4_t b) {
+  vst1_p16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP29]], 0
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i8> [[_MSLD4]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP30]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
+// CHECK:       31:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       32:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) {
+  vst2q_u8(a, b);
+}
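+
+// Illustrative sketch (not autogenerated): for the vst2q forms the data store
+// happens inside @llvm.aarch64.neon.st2, so the instrumentation above only
+// ORs together the shadow checks of both vector halves and of the pointer;
+// no shadow store is emitted for the 32 destination bytes. A hypothetical
+// way to observe that, assuming <sanitizer/msan_interface.h> is available:
+//
+//   uint8_t buf[32];
+//   uint8x16x2_t v = {{ vdupq_n_u8(0), vdupq_n_u8(1) }};
+//   vst2q_u8(buf, v);
+//   // With the IR above, no shadow is written for buf by the st2 call.
+//   __msan_check_mem_is_initialized(buf, sizeof(buf));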
+
+// CHECK-LABEL: define dso_local void @test_vst2q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP34]], <8 x i16> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) {
+  vst2q_u16(a, b);
+}
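+
+// Note (illustrative, not autogenerated): as in the block above, the shadow of
+// the by-value [2 x <8 x i16>] aggregate is loaded from __msan_param_tls at
+// byte offset 8, i.e. just past the 8-byte parameter slot used by the pointer.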
+
+// CHECK-LABEL: define dso_local void @test_vst2q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP34]], <4 x i32> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) {
+  vst2q_u32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP34]], <2 x i64> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) {
+  vst2q_u64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP29]], 0
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i8> [[_MSLD4]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP30]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
+// CHECK:       31:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       32:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_s8(int8_t *a, int8x16x2_t b) {
+  vst2q_s8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP34]], <8 x i16> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_s16(int16_t *a, int16x8x2_t b) {
+  vst2q_s16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP34]], <4 x i32> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_s32(int32_t *a, int32x4x2_t b) {
+  vst2q_s32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP34]], <2 x i64> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
+  vst2q_s64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <8 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x half> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x half> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x half>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x half>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[TMP34]], <8 x half> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_f16(float16_t *a, float16x8x2_t b) {
+  vst2q_f16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <4 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x float> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x float>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x float>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[TMP34]], <4 x float> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_f32(float32_t *a, float32x4x2_t b) {
+  vst2q_f32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[TMP34]], <2 x double> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_f64(float64_t *a, float64x2x2_t b) {
+  vst2q_f64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP29]], 0
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i8> [[_MSLD4]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP30]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
+// CHECK:       31:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       32:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) {
+  vst2q_p8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP34]], <8 x i16> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) {
+  vst2q_p16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP29]], 0
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i8> [[_MSLD4]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP30]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
+// CHECK:       31:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       32:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_u8(uint8_t *a, uint8x8x2_t b) {
+  vst2_u8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP34]], <4 x i16> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_u16(uint16_t *a, uint16x4x2_t b) {
+  vst2_u16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP34]], <2 x i32> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_u32(uint32_t *a, uint32x2x2_t b) {
+  vst2_u32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP34]], <1 x i64> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_u64(uint64_t *a, uint64x1x2_t b) {
+  vst2_u64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP29]], 0
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i8> [[_MSLD4]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP30]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
+// CHECK:       31:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       32:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_s8(int8_t *a, int8x8x2_t b) {
+  vst2_s8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP34]], <4 x i16> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_s16(int16_t *a, int16x4x2_t b) {
+  vst2_s16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP34]], <2 x i32> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_s32(int32_t *a, int32x2x2_t b) {
+  vst2_s32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP34]], <1 x i64> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_s64(int64_t *a, int64x1x2_t b) {
+  vst2_s64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <4 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x half> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x half> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x half>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x half>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[TMP34]], <4 x half> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_f16(float16_t *a, float16x4x2_t b) {
+  vst2_f16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <2 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x float> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x float> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x float>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x float>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[TMP34]], <2 x float> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_f32(float32_t *a, float32x2x2_t b) {
+  vst2_f32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[TMP34]], <1 x double> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_f64(float64_t *a, float64x1x2_t b) {
+  vst2_f64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP29]], 0
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i8> [[_MSLD4]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP30]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
+// CHECK:       31:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       32:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
+  vst2_p8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP34]], <4 x i16> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
+  vst2_p16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[_MSLD5]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP33]], 0
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[_MSLD6]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP34]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP35]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
+// CHECK:       36:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       37:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
+  vst3q_u8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i16> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i16> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i16> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP40]], <8 x i16> [[TMP42]], <8 x i16> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
+  vst3q_u16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x i32>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i32> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i32> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP40]], <4 x i32> [[TMP42]], <4 x i32> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
+  vst3q_u32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP40]], <2 x i64> [[TMP42]], <2 x i64> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
+  vst3q_u64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[_MSLD5]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP33]], 0
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[_MSLD6]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP34]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP35]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
+// CHECK:       36:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       37:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
+  vst3q_s8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i16> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i16> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i16> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP40]], <8 x i16> [[TMP42]], <8 x i16> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
+  vst3q_s16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x i32>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i32> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i32> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP40]], <4 x i32> [[TMP42]], <4 x i32> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
+  vst3q_s32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP40]], <2 x i64> [[TMP42]], <2 x i64> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
+  vst3q_s64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <8 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x half> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x half> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x half> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x half>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x half>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x half>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i16> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i16> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i16> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[TMP40]], <8 x half> [[TMP42]], <8 x half> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
+  vst3q_f16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <4 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x float> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x float> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x float>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x float>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x float>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i32> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i32> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[TMP40]], <4 x float> [[TMP42]], <4 x float> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
+  vst3q_f32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x double> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x double>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[TMP40]], <2 x double> [[TMP42]], <2 x double> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
+  vst3q_f64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[_MSLD5]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP33]], 0
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[_MSLD6]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP34]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP35]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
+// CHECK:       36:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       37:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
+  vst3q_p8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i16> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i16> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i16> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP40]], <8 x i16> [[TMP42]], <8 x i16> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
+  vst3q_p16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[_MSLD5]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP33]], 0
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[_MSLD6]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP34]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP35]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
+// CHECK:       36:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       37:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
+  vst3_u8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i16> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i16> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i16> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP40]], <4 x i16> [[TMP42]], <4 x i16> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
+  vst3_u16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x i32>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i32> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i32> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i32> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP40]], <2 x i32> [[TMP42]], <2 x i32> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
+  vst3_u32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP40]], <1 x i64> [[TMP42]], <1 x i64> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
+  vst3_u64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[_MSLD5]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP33]], 0
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[_MSLD6]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP34]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP35]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
+// CHECK:       36:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       37:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_s8(int8_t *a, int8x8x3_t b) {
+  vst3_s8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i16> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i16> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i16> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP40]], <4 x i16> [[TMP42]], <4 x i16> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_s16(int16_t *a, int16x4x3_t b) {
+  vst3_s16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x i32>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i32> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i32> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i32> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP40]], <2 x i32> [[TMP42]], <2 x i32> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_s32(int32_t *a, int32x2x3_t b) {
+  vst3_s32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP40]], <1 x i64> [[TMP42]], <1 x i64> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_s64(int64_t *a, int64x1x3_t b) {
+  vst3_s64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <4 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x half> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x half> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x half> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x half>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x half>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x half>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i16> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i16> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i16> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[TMP40]], <4 x half> [[TMP42]], <4 x half> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_f16(float16_t *a, float16x4x3_t b) {
+  vst3_f16(a, b);
+}
+
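+// For floating-point element types the shadow stays integer-typed: note above
+// how each <4 x half> value is paired with a <4 x i16> shadow load, and both
+// are funneled through <8 x i8> bitcasts before the st3 call is checked.
+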
+// CHECK-LABEL: define dso_local void @test_vst3_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <2 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x float> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x float> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x float> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x float>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x float>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x float>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i32> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i32> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i32> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[TMP40]], <2 x float> [[TMP42]], <2 x float> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_f32(float32_t *a, float32x2x3_t b) {
+  vst3_f32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x double> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x double>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[TMP40]], <1 x double> [[TMP42]], <1 x double> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_f64(float64_t *a, float64x1x3_t b) {
+  vst3_f64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[_MSLD5]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP33]], 0
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[_MSLD6]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP34]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP35]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
+// CHECK:       36:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       37:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
+  vst3_p8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i16> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i16> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i16> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP40]], <4 x i16> [[TMP42]], <4 x i16> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
+  vst3_p16(a, b);
+}
+
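+// Descriptive note (not autogenerated): the vst4q tests below follow the same
+// pattern as the vst3 tests above. The four-vector aggregate is copied into
+// the local __s1, each element and its shadow (loaded from the element
+// address XOR'ed with the shadow offset 193514046488576) are read back, the
+// four element shadows and the pointer shadow are OR-combined, and any
+// non-zero shadow branches to __msan_warning_noreturn before the
+// llvm.aarch64.neon.st4.* intrinsic call.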
+// CHECK-LABEL: define dso_local void @test_vst4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP33:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <16 x i8>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i8> [[_MSLD8]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[_MSLD9]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP39]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[_MSLD10]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP40]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
+// CHECK:       41:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       42:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], <16 x i8> [[TMP33]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
+  vst4q_u8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i16>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i16> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i16> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <8 x i16>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x i16>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <8 x i16> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <8 x i16> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP46]], <8 x i16> [[TMP48]], <8 x i16> [[TMP50]], <8 x i16> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
+  vst4q_u16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i32>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i32> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i32> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x i32>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <4 x i32>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <4 x i32>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i32> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i32> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i32> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i32> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP46]], <4 x i32> [[TMP48]], <4 x i32> [[TMP50]], <4 x i32> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
+  vst4q_u32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i64> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x i64>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP46]], <2 x i64> [[TMP48]], <2 x i64> [[TMP50]], <2 x i64> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
+  vst4q_u64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP33:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <16 x i8>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i8> [[_MSLD8]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[_MSLD9]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP39]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[_MSLD10]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP40]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
+// CHECK:       41:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       42:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], <16 x i8> [[TMP33]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
+  vst4q_s8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i16>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i16> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i16> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <8 x i16>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x i16>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <8 x i16> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <8 x i16> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP46]], <8 x i16> [[TMP48]], <8 x i16> [[TMP50]], <8 x i16> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
+  vst4q_s16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i32>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i32> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i32> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x i32>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <4 x i32>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <4 x i32>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i32> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i32> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i32> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i32> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP46]], <4 x i32> [[TMP48]], <4 x i32> [[TMP50]], <4 x i32> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
+  vst4q_s32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i64> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x i64>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP46]], <2 x i64> [[TMP48]], <2 x i64> [[TMP50]], <2 x i64> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
+  vst4q_s64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <8 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x half> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x half> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x half> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i16>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i16> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x half> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x half>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x half>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x half>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <8 x i16>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x half>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <8 x i16> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <8 x i16> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[TMP46]], <8 x half> [[TMP48]], <8 x half> [[TMP50]], <8 x half> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
+  vst4q_f16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <4 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x float> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x float> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i32>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i32> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x float> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x float>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x float>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x float>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <4 x i32>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <4 x float>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i32> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i32> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i32> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i32> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[TMP46]], <4 x float> [[TMP48]], <4 x float> [[TMP50]], <4 x float> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
+  vst4q_f32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x double> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x double> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x double>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x double>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[TMP46]], <2 x double> [[TMP48]], <2 x double> [[TMP50]], <2 x double> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
+  vst4q_f64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP33:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <16 x i8>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i8> [[_MSLD8]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[_MSLD9]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP39]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[_MSLD10]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP40]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
+// CHECK:       41:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       42:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], <16 x i8> [[TMP33]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
+  vst4q_p8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i16>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i16> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i16> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <8 x i16>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x i16>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <8 x i16> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <8 x i16> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP46]], <8 x i16> [[TMP48]], <8 x i16> [[TMP50]], <8 x i16> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
+  vst4q_p16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i8>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i8> [[_MSLD8]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[_MSLD9]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP39]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[_MSLD10]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP40]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
+// CHECK:       41:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       42:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], <8 x i8> [[TMP33]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
+  vst4_u8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i16>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i16> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i16> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <4 x i16>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <4 x i16>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i16> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i16> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i16> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i16> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP46]], <4 x i16> [[TMP48]], <4 x i16> [[TMP50]], <4 x i16> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
+  vst4_u16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i32>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i32> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i32> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x i32>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <2 x i32>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <2 x i32>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i32> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i32> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i32> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i32> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP46]], <2 x i32> [[TMP48]], <2 x i32> [[TMP50]], <2 x i32> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
+  vst4_u32(a, b);
+}
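+
+// A minimal usage sketch (hypothetical; not exercised by the checks above),
+// illustrating how the shadow checks are expected to fire at runtime when a
+// partially-uninitialized tuple is stored:
+//
+//   void sketch_vst4_u32_uninit(uint32_t *dst) {
+//     uint32x2x4_t v;            // val[1..3] are left uninitialized
+//     v.val[0] = vdup_n_u32(1);  // only the first vector is initialized
+//     vst4_u32(dst, v);          // expected to reach __msan_warning_noreturn()
+//   }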
+
+// CHECK-LABEL: define dso_local void @test_vst4_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x i64> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x i64>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP46]], <1 x i64> [[TMP48]], <1 x i64> [[TMP50]], <1 x i64> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
+  vst4_u64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i8>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i8> [[_MSLD8]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[_MSLD9]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP39]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[_MSLD10]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP40]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
+// CHECK:       41:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       42:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], <8 x i8> [[TMP33]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_s8(int8_t *a, int8x8x4_t b) {
+  vst4_s8(a, b);
+}
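+
+// In the <8 x i8> variant the operands are already byte vectors, so the
+// shadow loads (_MSLD7.._MSLD10) are bitcast straight to i64 for the
+// comparisons, without the <8 x i8> round-trip used by the wider-element
+// tests, and the loaded vectors are passed to st4 unchanged.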
+
+// CHECK-LABEL: define dso_local void @test_vst4_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i16>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i16> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i16> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <4 x i16>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <4 x i16>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i16> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i16> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i16> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i16> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP46]], <4 x i16> [[TMP48]], <4 x i16> [[TMP50]], <4 x i16> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_s16(int16_t *a, int16x4x4_t b) {
+  vst4_s16(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i32>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i32> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i32> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x i32>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <2 x i32>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <2 x i32>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i32> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i32> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i32> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i32> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP46]], <2 x i32> [[TMP48]], <2 x i32> [[TMP50]], <2 x i32> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_s32(int32_t *a, int32x2x4_t b) {
+  vst4_s32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x i64> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x i64>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP46]], <1 x i64> [[TMP48]], <1 x i64> [[TMP50]], <1 x i64> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_s64(int64_t *a, int64x1x4_t b) {
+  vst4_s64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <4 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x half> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x half> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x half> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i16>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i16> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x half> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x half>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x half>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x half>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <4 x i16>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <4 x half>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i16> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i16> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i16> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i16> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[TMP46]], <4 x half> [[TMP48]], <4 x half> [[TMP50]], <4 x half> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_f16(float16_t *a, float16x4x4_t b) {
+  vst4_f16(a, b);
+}
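+
+// In the float16 variant the data is loaded as <4 x half> while its shadow is
+// loaded as <4 x i16> (_MSLD7.._MSLD10): MSan tracks shadow in same-sized
+// integer vector types, so only the data operands keep the half type at the
+// st4 call.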
+
+// CHECK-LABEL: define dso_local void @test_vst4_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <2 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x float> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x float> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x float> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i32>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i32> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x float> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x float>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x float>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x float>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <2 x i32>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <2 x float>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i32> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i32> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i32> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i32> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[TMP46]], <2 x float> [[TMP48]], <2 x float> [[TMP50]], <2 x float> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_f32(float32_t *a, float32x2x4_t b) {
+  vst4_f32(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x double> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x double> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x double>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x double>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[TMP46]], <1 x double> [[TMP48]], <1 x double> [[TMP50]], <1 x double> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_f64(float64_t *a, float64x1x4_t b) {
+  vst4_f64(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
+// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
+// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i8>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i8> [[_MSLD8]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[_MSLD9]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP39]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[_MSLD10]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP40]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
+// CHECK:       41:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       42:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], <8 x i8> [[TMP33]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
+  vst4_p8(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i16>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i16> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i16> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <4 x i16>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <4 x i16>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i16> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i16> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i16> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i16> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP46]], <4 x i16> [[TMP48]], <4 x i16> [[TMP50]], <4 x i16> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
+  vst4_p16(a, b);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld1q_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X2_T]] [[TMP22]]
+//
+float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
+  return vld1q_f64_x2(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld1q_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X2X2_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X2X2_T]] [[TMP22]]
+//
+poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
+  return vld1q_p64_x2(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld1_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X2_T]] [[TMP22]]
+//
+float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
+  return vld1_f64_x2(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld1_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X1X2_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X1X2_T]] [[TMP22]]
+//
+poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
+  return vld1_p64_x2(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld1q_f64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X3_T]] [[TMP22]]
+//
+float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
+  return vld1q_f64_x3(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld1q_p64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X2X3_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X2X3_T]] [[TMP22]]
+//
+poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
+  return vld1q_p64_x3(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld1_f64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X3_T]] [[TMP22]]
+//
+float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
+  return vld1_f64_x3(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld1_p64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X1X3_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X1X3_T]] [[TMP22]]
+//
+poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
+  return vld1_p64_x3(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld1q_f64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X4_T]] [[TMP22]]
+//
+float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
+  return vld1q_f64_x4(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld1q_p64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]], align 16
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X2X4_T]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
+// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X2X4_T]] [[TMP22]]
+//
+poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
+  return vld1q_p64_x4(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld1_f64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X4_T]] [[TMP22]]
+//
+float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
+  return vld1_f64_x4(a);
+}
+
+// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld1_p64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
+// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
+// CHECK:       16:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       17:
+// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr [[TMP12]])
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]], align 8
+// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X1X4_T]], ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
+// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
+// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
+// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X1X4_T]] [[TMP22]]
+//
+poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
+  return vld1_p64_x4(a);
+}
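+
+// Note on the vld1*_x4 patterns above: the instrumented IR stores
+// zeroinitializer into the shadow of [[__RET]] right after the
+// @llvm.aarch64.neon.ld1x4 call, so the loaded lanes are treated as fully
+// initialized regardless of the shadow of the memory behind the source
+// pointer. These assertions only pin down the current instrumentation.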
+
+// CHECK-LABEL: define dso_local void @test_vst1q_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[TMP34]], <2 x double> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) {
+  vst1q_f64_x2(a, b);
+}
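+
+// Rough sketch of what the vst1q_*_x2 CHECK lines above encode (illustrative
+// pseudo-code only; shadow(x) is not a real API, it stands for the value
+// loaded from the shadow address x ^ 0xB00000000000, i.e. 193514046488576):
+//
+//   if (shadow(b.val[0]) != 0 || shadow(b.val[1]) != 0 || shadow(a) != 0)
+//     __msan_warning_noreturn();
+//   // otherwise the store proceeds via @llvm.aarch64.neon.st1x2
+//
+// The checked IR performs these operand checks but does not write any shadow
+// for the destination bytes at *a.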
+
+// CHECK-LABEL: define dso_local void @test_vst1q_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP34]], <2 x i64> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
+  vst1q_p64_x2(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[TMP34]], <1 x double> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) {
+  vst1_f64_x2(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
+// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
+// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
+// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
+// CHECK:       39:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       40:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP34]], <1 x i64> [[TMP36]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
+  vst1_p64_x2(a, b);
+}
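+
+// The 64-bit variants above (vst1_f64_x2 / vst1_p64_x2) follow the same
+// pattern as the q-form tests, except that each operand shadow is collapsed
+// to an i64 (rather than an i128) before the icmp against zero.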
+
+// CHECK-LABEL: define dso_local void @test_vst1q_f64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x double> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x double>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[TMP40]], <2 x double> [[TMP42]], <2 x double> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) {
+  vst1q_f64_x3(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_p64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP40]], <2 x i64> [[TMP42]], <2 x i64> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
+  vst1q_p64_x3(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_f64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x double> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x double>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[TMP40]], <1 x double> [[TMP42]], <1 x double> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
+  vst1_f64_x3(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_p64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
+// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
+// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
+// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
+// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
+// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
+// CHECK:       48:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       49:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP40]], <1 x i64> [[TMP42]], <1 x i64> [[TMP44]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
+  vst1_p64_x3(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_f64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x double> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x double> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x double>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x double>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[TMP46]], <2 x double> [[TMP48]], <2 x double> [[TMP50]], <2 x double> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) {
+  vst1q_f64_x4(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_p64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
+// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i64> [[TMP39]] to <16 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x i64>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP46]], <2 x i64> [[TMP48]], <2 x i64> [[TMP50]], <2 x i64> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
+  vst1q_p64_x4(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_f64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x double> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x double> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x double>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x double>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[TMP46]], <1 x double> [[TMP48]], <1 x double> [[TMP50]], <1 x double> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) {
+  vst1_f64_x4(a, b);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_p64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+// CHECK-NEXT:    call void @llvm.donothing()
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
+// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
+// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
+// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
+// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
+// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
+// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
+// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
+// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
+// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
+// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
+// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
+// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
+// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
+// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
+// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
+// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
+// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x i64> [[TMP39]] to <8 x i8>
+// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
+// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
+// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
+// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
+// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
+// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
+// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
+// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x i64>
+// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
+// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
+// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
+// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
+// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
+// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
+// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
+// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
+// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
+// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
+// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
+// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
+// CHECK:       57:
+// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+// CHECK-NEXT:    unreachable
+// CHECK:       58:
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP46]], <1 x i64> [[TMP48]], <1 x i64> [[TMP50]], <1 x i64> [[TMP52]], ptr [[TMP17]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) {
+  vst1_p64_x4(a, b);
+}
+//.
+// CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+//.

>From 3c00fe4f82a01b975960e911774e68ab3d418306 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Tue, 16 Jul 2024 16:56:11 +0000
Subject: [PATCH 2/5] Update test to use IR instead of C

---
 .../aarch64-neon-intrinsics-msan-vst.c        |  1250 --
 .../CodeGen/aarch64-neon-intrinsics-msan.c    | 18071 ----------------
 .../MemorySanitizer/AArch64/neon_vst.ll       |  1515 ++
 3 files changed, 1515 insertions(+), 19321 deletions(-)
 delete mode 100644 clang/test/CodeGen/aarch64-neon-intrinsics-msan-vst.c
 delete mode 100644 clang/test/CodeGen/aarch64-neon-intrinsics-msan.c
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll

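(Note on the shape of the new IR test: the snippet below is only a minimal sketch of how an IR-level MemorySanitizer test for one of these NEON store intrinsics is typically written with the new pass manager, included here to make the C-to-IR move concrete. The RUN line, target triple, and function name are assumptions based on standard LLVM test conventions and are not quoted from neon_vst.ll; the intrinsic signature matches the st1x4 call emitted in the C tests being deleted above.)

; RUN: opt < %s -passes=msan -S | FileCheck %s
; Assumed triple; the real test may use a different one.
target triple = "aarch64-unknown-linux-gnu"

; A function with the sanitize_memory attribute that calls the NEON
; st1x4 store intrinsic directly. Running it through -passes=msan lets
; FileCheck inspect how (or whether) the instrumentation checks the
; shadows of the vector operands and the destination pointer.
define void @st1x4_v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c, <1 x i64> %d, ptr %p) sanitize_memory {
entry:
  call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c, <1 x i64> %d, ptr %p)
  ret void
}

declare void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, ptr)
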
diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics-msan-vst.c b/clang/test/CodeGen/aarch64-neon-intrinsics-msan-vst.c
deleted file mode 100644
index c0cfe093a1a18..0000000000000
--- a/clang/test/CodeGen/aarch64-neon-intrinsics-msan-vst.c
+++ /dev/null
@@ -1,1250 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:     -S \
-// RUN:  -emit-llvm -o - %s -fsanitize=memory \
-// RUN: | FileCheck %s
-
-// REQUIRES: aarch64-registered-target || arm-registered-target
-
-#include <arm_neon.h>
-#include <sanitizer/msan_interface.h>
-
-// CHECK-LABEL: define dso_local noundef i32 @test_vst1(
-// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[__P0_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP2]], i8 -1, i64 2, i1 false)
-// CHECK-NEXT:    [[__RET_I:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[VEC1:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[DST1:%.*]] = alloca [8 x i16], align 2
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[SUM:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[VEC1]]) #[[ATTR4:[0-9]+]]
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[VEC1]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP8]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store i16 0, ptr [[TMP11]], align 2
-// CHECK-NEXT:    store i16 15, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP18]], align 2
-// CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, i16 [[_MSLD]], i32 0
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP15]], i32 0
-// CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP20]], 193514046488576
-// CHECK-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load i16, ptr [[TMP22]], align 2
-// CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD2]], i32 1
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[TMP19]], i32 1
-// CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP24:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP24]], 193514046488576
-// CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load i16, ptr [[TMP26]], align 2
-// CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[_MSLD4]], i32 2
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[TMP23]], i32 2
-// CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load i16, ptr [[TMP30]], align 2
-// CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[_MSLD6]], i32 3
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[TMP27]], i32 3
-// CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load i16, ptr [[TMP34]], align 2
-// CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[_MSLD8]], i32 4
-// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[TMP31]], i32 4
-// CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load i16, ptr [[TMP38]], align 2
-// CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <8 x i16> [[_MSPROP9]], i16 [[_MSLD10]], i32 5
-// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[TMP35]], i32 5
-// CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD12:%.*]] = load i16, ptr [[TMP42]], align 2
-// CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <8 x i16> [[_MSPROP11]], i16 [[_MSLD12]], i32 6
-// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[TMP39]], i32 6
-// CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP44:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP45:%.*]] = xor i64 [[TMP44]], 193514046488576
-// CHECK-NEXT:    [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr
-// CHECK-NEXT:    [[_MSLD14:%.*]] = load i16, ptr [[TMP46]], align 2
-// CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <8 x i16> [[_MSPROP13]], i16 [[_MSLD14]], i32 7
-// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[TMP43]], i32 7
-// CHECK-NEXT:    [[TMP47:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP48:%.*]] = xor i64 [[TMP47]], 193514046488576
-// CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSPROP15]], ptr [[TMP49]], align 16
-// CHECK-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
-// CHECK-NEXT:    [[TMP50:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
-// CHECK-NEXT:    [[TMP51:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP52:%.*]] = xor i64 [[TMP51]], 193514046488576
-// CHECK-NEXT:    [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
-// CHECK-NEXT:    [[_MSLD16:%.*]] = load <8 x i16>, ptr [[TMP53]], align 16
-// CHECK-NEXT:    [[TMP54:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP55:%.*]] = xor i64 [[TMP54]], 193514046488576
-// CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD16]], ptr [[TMP56]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP50]], ptr [[__RET_I]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = load <8 x i16>, ptr [[__RET_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP59:%.*]] = xor i64 [[TMP58]], 193514046488576
-// CHECK-NEXT:    [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
-// CHECK-NEXT:    [[_MSLD17:%.*]] = load <8 x i16>, ptr [[TMP60]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP61:%.*]] = ptrtoint ptr [[VEC1]] to i64
-// CHECK-NEXT:    [[TMP62:%.*]] = xor i64 [[TMP61]], 193514046488576
-// CHECK-NEXT:    [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD17]], ptr [[TMP63]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP57]], ptr [[VEC1]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[DST1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[DST1]] to i64
-// CHECK-NEXT:    [[TMP65:%.*]] = xor i64 [[TMP64]], 193514046488576
-// CHECK-NEXT:    [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP66]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP67:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP68:%.*]] = xor i64 [[TMP67]], 193514046488576
-// CHECK-NEXT:    [[TMP69:%.*]] = inttoptr i64 [[TMP68]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP69]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP70:%.*]] = load <8 x i16>, ptr [[VEC1]], align 16
-// CHECK-NEXT:    [[TMP71:%.*]] = ptrtoint ptr [[VEC1]] to i64
-// CHECK-NEXT:    [[TMP72:%.*]] = xor i64 [[TMP71]], 193514046488576
-// CHECK-NEXT:    [[TMP73:%.*]] = inttoptr i64 [[TMP72]] to ptr
-// CHECK-NEXT:    [[_MSLD18:%.*]] = load <8 x i16>, ptr [[TMP73]], align 16
-// CHECK-NEXT:    [[TMP74:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP75:%.*]] = xor i64 [[TMP74]], 193514046488576
-// CHECK-NEXT:    [[TMP76:%.*]] = inttoptr i64 [[TMP75]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD18]], ptr [[TMP76]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP70]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [8 x i16], ptr [[DST1]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP77:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP78:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP79:%.*]] = xor i64 [[TMP78]], 193514046488576
-// CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
-// CHECK-NEXT:    [[_MSLD19:%.*]] = load <8 x i16>, ptr [[TMP80]], align 16
-// CHECK-NEXT:    [[TMP81:%.*]] = bitcast <8 x i16> [[_MSLD19]] to <16 x i8>
-// CHECK-NEXT:    [[TMP82:%.*]] = bitcast <8 x i16> [[TMP77]] to <16 x i8>
-// CHECK-NEXT:    [[TMP83:%.*]] = bitcast <16 x i8> [[TMP81]] to <8 x i16>
-// CHECK-NEXT:    [[TMP84:%.*]] = bitcast <16 x i8> [[TMP82]] to <8 x i16>
-// CHECK-NEXT:    [[TMP85:%.*]] = ptrtoint ptr [[ARRAYDECAY]] to i64
-// CHECK-NEXT:    [[TMP86:%.*]] = xor i64 [[TMP85]], 193514046488576
-// CHECK-NEXT:    [[TMP87:%.*]] = inttoptr i64 [[TMP86]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[TMP83]], ptr [[TMP87]], align 2
-// CHECK-NEXT:    store <8 x i16> [[TMP84]], ptr [[ARRAYDECAY]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [8 x i16], ptr [[DST1]], i64 0, i64 0
-// CHECK-NEXT:    call void @__msan_print_shadow(ptr noundef [[ARRAYDECAY1]], i64 noundef 16)
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP88:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP89:%.*]] = xor i64 [[TMP88]], 193514046488576
-// CHECK-NEXT:    [[TMP90:%.*]] = inttoptr i64 [[TMP89]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP90]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[TMP91:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP92:%.*]] = xor i64 [[TMP91]], 193514046488576
-// CHECK-NEXT:    [[TMP93:%.*]] = inttoptr i64 [[TMP92]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP93]], align 4
-// CHECK-NEXT:    store i32 0, ptr [[SUM]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP94:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP95:%.*]] = xor i64 [[TMP94]], 193514046488576
-// CHECK-NEXT:    [[TMP96:%.*]] = inttoptr i64 [[TMP95]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP96]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[TMP97:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP98:%.*]] = xor i64 [[TMP97]], 193514046488576
-// CHECK-NEXT:    [[TMP99:%.*]] = inttoptr i64 [[TMP98]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP99]], align 4
-// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP100:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP101:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP102:%.*]] = xor i64 [[TMP101]], 193514046488576
-// CHECK-NEXT:    [[TMP103:%.*]] = inttoptr i64 [[TMP102]] to ptr
-// CHECK-NEXT:    [[_MSLD20:%.*]] = load i32, ptr [[TMP103]], align 4
-// CHECK-NEXT:    [[_MSPROP21:%.*]] = or i32 [[_MSLD20]], 0
-// CHECK-NEXT:    [[TMP104:%.*]] = icmp ne i32 [[_MSPROP21]], 0
-// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP100]], 8
-// CHECK-NEXT:    br i1 [[TMP104]], label [[TMP105:%.*]], label [[TMP106:%.*]], !prof [[PROF2:![0-9]+]]
-// CHECK:       105:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]]
-// CHECK-NEXT:    unreachable
-// CHECK:       106:
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    br label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    [[TMP107:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP108:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP109:%.*]] = xor i64 [[TMP108]], 193514046488576
-// CHECK-NEXT:    [[TMP110:%.*]] = inttoptr i64 [[TMP109]] to ptr
-// CHECK-NEXT:    [[_MSLD22:%.*]] = load i32, ptr [[TMP110]], align 4
-// CHECK-NEXT:    [[_MSPROP23:%.*]] = sext i32 [[_MSLD22]] to i64
-// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP107]] to i64
-// CHECK-NEXT:    [[_MSPROP24:%.*]] = or i64 0, [[_MSPROP23]]
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i16], ptr [[DST1]], i64 0, i64 [[IDXPROM]]
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP24]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP111:%.*]], label [[TMP112:%.*]], !prof [[PROF2]]
-// CHECK:       111:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       112:
-// CHECK-NEXT:    [[TMP113:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-// CHECK-NEXT:    [[TMP114:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP115:%.*]] = xor i64 [[TMP114]], 193514046488576
-// CHECK-NEXT:    [[TMP116:%.*]] = inttoptr i64 [[TMP115]] to ptr
-// CHECK-NEXT:    [[_MSLD25:%.*]] = load i16, ptr [[TMP116]], align 2
-// CHECK-NEXT:    [[_MSPROP26:%.*]] = sext i16 [[_MSLD25]] to i32
-// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP113]] to i32
-// CHECK-NEXT:    [[TMP117:%.*]] = load i32, ptr [[SUM]], align 4
-// CHECK-NEXT:    [[TMP118:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP119:%.*]] = xor i64 [[TMP118]], 193514046488576
-// CHECK-NEXT:    [[TMP120:%.*]] = inttoptr i64 [[TMP119]] to ptr
-// CHECK-NEXT:    [[_MSLD27:%.*]] = load i32, ptr [[TMP120]], align 4
-// CHECK-NEXT:    [[_MSPROP28:%.*]] = or i32 [[_MSLD27]], [[_MSPROP26]]
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP117]], [[CONV]]
-// CHECK-NEXT:    [[TMP121:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP122:%.*]] = xor i64 [[TMP121]], 193514046488576
-// CHECK-NEXT:    [[TMP123:%.*]] = inttoptr i64 [[TMP122]] to ptr
-// CHECK-NEXT:    store i32 [[_MSPROP28]], ptr [[TMP123]], align 4
-// CHECK-NEXT:    store i32 [[ADD]], ptr [[SUM]], align 4
-// CHECK-NEXT:    br label [[FOR_INC:%.*]]
-// CHECK:       for.inc:
-// CHECK-NEXT:    [[TMP124:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP125:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP126:%.*]] = xor i64 [[TMP125]], 193514046488576
-// CHECK-NEXT:    [[TMP127:%.*]] = inttoptr i64 [[TMP126]] to ptr
-// CHECK-NEXT:    [[_MSLD29:%.*]] = load i32, ptr [[TMP127]], align 4
-// CHECK-NEXT:    [[_MSPROP30:%.*]] = or i32 [[_MSLD29]], 0
-// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP124]], 1
-// CHECK-NEXT:    [[TMP128:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP129:%.*]] = xor i64 [[TMP128]], 193514046488576
-// CHECK-NEXT:    [[TMP130:%.*]] = inttoptr i64 [[TMP129]] to ptr
-// CHECK-NEXT:    store i32 [[_MSPROP30]], ptr [[TMP130]], align 4
-// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
-// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
-// CHECK:       for.end:
-// CHECK-NEXT:    [[TMP131:%.*]] = load i32, ptr [[SUM]], align 4
-// CHECK-NEXT:    [[TMP132:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP133:%.*]] = xor i64 [[TMP132]], 193514046488576
-// CHECK-NEXT:    [[TMP134:%.*]] = inttoptr i64 [[TMP133]] to ptr
-// CHECK-NEXT:    [[_MSLD31:%.*]] = load i32, ptr [[TMP134]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[DST1]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[VEC1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[_MSCMP32:%.*]] = icmp ne i32 [[_MSLD31]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP32]], label [[TMP135:%.*]], label [[TMP136:%.*]], !prof [[PROF2]]
-// CHECK:       135:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       136:
-// CHECK-NEXT:    ret i32 [[TMP131]]
-//
-int test_vst1(void) {
-  int16x8_t vec1;
-  vec1 = vdupq_n_s16(15);
-  int16_t dst1[8*1];
-  vst1q_s16(dst1, vec1);
-
-  __msan_print_shadow(dst1, sizeof(int16_t)*8*1);
-
-  int sum = 0;
-  for (int i = 0; i < 8*1; i++)
-    sum += dst1[i];
-
-  return sum;
-}
-
-// Initialization is only partial to make the shadows more interesting
-// CHECK-LABEL: define dso_local noundef i32 @test_vst2(
-// CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[__P0_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP2]], i8 -1, i64 2, i1 false)
-// CHECK-NEXT:    [[__RET_I:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[VEC2:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[DST2:%.*]] = alloca [16 x i16], align 2
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
-// CHECK-NEXT:    [[SUM:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[VEC2]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[VEC2]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP8]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store i16 0, ptr [[TMP11]], align 2
-// CHECK-NEXT:    store i16 16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP18]], align 2
-// CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, i16 [[_MSLD]], i32 0
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP15]], i32 0
-// CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP20]], 193514046488576
-// CHECK-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load i16, ptr [[TMP22]], align 2
-// CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD7]], i32 1
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[TMP19]], i32 1
-// CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP24:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP24]], 193514046488576
-// CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load i16, ptr [[TMP26]], align 2
-// CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <8 x i16> [[_MSPROP8]], i16 [[_MSLD9]], i32 2
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[TMP23]], i32 2
-// CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD11:%.*]] = load i16, ptr [[TMP30]], align 2
-// CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <8 x i16> [[_MSPROP10]], i16 [[_MSLD11]], i32 3
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[TMP27]], i32 3
-// CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    [[_MSLD13:%.*]] = load i16, ptr [[TMP34]], align 2
-// CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <8 x i16> [[_MSPROP12]], i16 [[_MSLD13]], i32 4
-// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[TMP31]], i32 4
-// CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    [[_MSLD15:%.*]] = load i16, ptr [[TMP38]], align 2
-// CHECK-NEXT:    [[_MSPROP16:%.*]] = insertelement <8 x i16> [[_MSPROP14]], i16 [[_MSLD15]], i32 5
-// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[TMP35]], i32 5
-// CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD17:%.*]] = load i16, ptr [[TMP42]], align 2
-// CHECK-NEXT:    [[_MSPROP18:%.*]] = insertelement <8 x i16> [[_MSPROP16]], i16 [[_MSLD17]], i32 6
-// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[TMP39]], i32 6
-// CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP44:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP45:%.*]] = xor i64 [[TMP44]], 193514046488576
-// CHECK-NEXT:    [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr
-// CHECK-NEXT:    [[_MSLD19:%.*]] = load i16, ptr [[TMP46]], align 2
-// CHECK-NEXT:    [[_MSPROP20:%.*]] = insertelement <8 x i16> [[_MSPROP18]], i16 [[_MSLD19]], i32 7
-// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[TMP43]], i32 7
-// CHECK-NEXT:    [[TMP47:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP48:%.*]] = xor i64 [[TMP47]], 193514046488576
-// CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSPROP20]], ptr [[TMP49]], align 16
-// CHECK-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
-// CHECK-NEXT:    [[TMP50:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
-// CHECK-NEXT:    [[TMP51:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP52:%.*]] = xor i64 [[TMP51]], 193514046488576
-// CHECK-NEXT:    [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
-// CHECK-NEXT:    [[_MSLD21:%.*]] = load <8 x i16>, ptr [[TMP53]], align 16
-// CHECK-NEXT:    [[TMP54:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP55:%.*]] = xor i64 [[TMP54]], 193514046488576
-// CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD21]], ptr [[TMP56]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP50]], ptr [[__RET_I]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = load <8 x i16>, ptr [[__RET_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP59:%.*]] = xor i64 [[TMP58]], 193514046488576
-// CHECK-NEXT:    [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
-// CHECK-NEXT:    [[_MSLD22:%.*]] = load <8 x i16>, ptr [[TMP60]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[VEC2]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP61:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP62:%.*]] = xor i64 [[TMP61]], 193514046488576
-// CHECK-NEXT:    [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD22]], ptr [[TMP63]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP57]], ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[DST2]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[DST2]] to i64
-// CHECK-NEXT:    [[TMP65:%.*]] = xor i64 [[TMP64]], 193514046488576
-// CHECK-NEXT:    [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP66]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP67:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP68:%.*]] = xor i64 [[TMP67]], 193514046488576
-// CHECK-NEXT:    [[TMP69:%.*]] = inttoptr i64 [[TMP68]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP69]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP70:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[VEC2]], i64 32)
-// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [16 x i16], ptr [[DST2]], i64 0, i64 0
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP71:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP72:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP73:%.*]] = xor i64 [[TMP72]], 193514046488576
-// CHECK-NEXT:    [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr
-// CHECK-NEXT:    [[_MSLD23:%.*]] = load <8 x i16>, ptr [[TMP74]], align 16
-// CHECK-NEXT:    [[TMP75:%.*]] = bitcast <8 x i16> [[_MSLD23]] to <16 x i8>
-// CHECK-NEXT:    [[TMP76:%.*]] = bitcast <8 x i16> [[TMP71]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL3]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP77:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP78:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP79:%.*]] = xor i64 [[TMP78]], 193514046488576
-// CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
-// CHECK-NEXT:    [[_MSLD24:%.*]] = load <8 x i16>, ptr [[TMP80]], align 16
-// CHECK-NEXT:    [[TMP81:%.*]] = bitcast <8 x i16> [[_MSLD24]] to <16 x i8>
-// CHECK-NEXT:    [[TMP82:%.*]] = bitcast <8 x i16> [[TMP77]] to <16 x i8>
-// CHECK-NEXT:    [[TMP83:%.*]] = bitcast <16 x i8> [[TMP75]] to <8 x i16>
-// CHECK-NEXT:    [[TMP84:%.*]] = bitcast <16 x i8> [[TMP76]] to <8 x i16>
-// CHECK-NEXT:    [[TMP85:%.*]] = bitcast <16 x i8> [[TMP81]] to <8 x i16>
-// CHECK-NEXT:    [[TMP86:%.*]] = bitcast <16 x i8> [[TMP82]] to <8 x i16>
-// CHECK-NEXT:    [[TMP87:%.*]] = bitcast <8 x i16> [[TMP83]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP87]], 0
-// CHECK-NEXT:    [[TMP88:%.*]] = bitcast <8 x i16> [[TMP85]] to i128
-// CHECK-NEXT:    [[_MSCMP37:%.*]] = icmp ne i128 [[TMP88]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP37]]
-// CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP89:%.*]], label [[TMP90:%.*]], !prof [[PROF2]]
-// CHECK:       89:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       90:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP84]], <8 x i16> [[TMP86]], ptr [[ARRAYDECAY]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[ARRAYDECAY5:%.*]] = getelementptr inbounds [16 x i16], ptr [[DST2]], i64 0, i64 0
-// CHECK-NEXT:    call void @__msan_print_shadow(ptr noundef [[ARRAYDECAY5]], i64 noundef 32)
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP91:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP92:%.*]] = xor i64 [[TMP91]], 193514046488576
-// CHECK-NEXT:    [[TMP93:%.*]] = inttoptr i64 [[TMP92]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP93]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[TMP94:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP95:%.*]] = xor i64 [[TMP94]], 193514046488576
-// CHECK-NEXT:    [[TMP96:%.*]] = inttoptr i64 [[TMP95]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP96]], align 4
-// CHECK-NEXT:    store i32 0, ptr [[SUM]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP97:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP98:%.*]] = xor i64 [[TMP97]], 193514046488576
-// CHECK-NEXT:    [[TMP99:%.*]] = inttoptr i64 [[TMP98]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP99]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[TMP100:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP101:%.*]] = xor i64 [[TMP100]], 193514046488576
-// CHECK-NEXT:    [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP102]], align 4
-// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP103:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP104:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP105:%.*]] = xor i64 [[TMP104]], 193514046488576
-// CHECK-NEXT:    [[TMP106:%.*]] = inttoptr i64 [[TMP105]] to ptr
-// CHECK-NEXT:    [[_MSLD25:%.*]] = load i32, ptr [[TMP106]], align 4
-// CHECK-NEXT:    [[_MSPROP26:%.*]] = or i32 [[_MSLD25]], 0
-// CHECK-NEXT:    [[TMP107:%.*]] = icmp ne i32 [[_MSPROP26]], 0
-// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP103]], 16
-// CHECK-NEXT:    br i1 [[TMP107]], label [[TMP108:%.*]], label [[TMP109:%.*]], !prof [[PROF2]]
-// CHECK:       108:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       109:
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    br label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    [[TMP110:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP111:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP112:%.*]] = xor i64 [[TMP111]], 193514046488576
-// CHECK-NEXT:    [[TMP113:%.*]] = inttoptr i64 [[TMP112]] to ptr
-// CHECK-NEXT:    [[_MSLD27:%.*]] = load i32, ptr [[TMP113]], align 4
-// CHECK-NEXT:    [[_MSPROP28:%.*]] = sext i32 [[_MSLD27]] to i64
-// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP110]] to i64
-// CHECK-NEXT:    [[_MSPROP29:%.*]] = or i64 0, [[_MSPROP28]]
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [16 x i16], ptr [[DST2]], i64 0, i64 [[IDXPROM]]
-// CHECK-NEXT:    [[_MSCMP38:%.*]] = icmp ne i64 [[_MSPROP29]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP38]], label [[TMP114:%.*]], label [[TMP115:%.*]], !prof [[PROF2]]
-// CHECK:       114:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       115:
-// CHECK-NEXT:    [[TMP116:%.*]] = load i16, ptr [[ARRAYIDX6]], align 2
-// CHECK-NEXT:    [[TMP117:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP118:%.*]] = xor i64 [[TMP117]], 193514046488576
-// CHECK-NEXT:    [[TMP119:%.*]] = inttoptr i64 [[TMP118]] to ptr
-// CHECK-NEXT:    [[_MSLD30:%.*]] = load i16, ptr [[TMP119]], align 2
-// CHECK-NEXT:    [[_MSPROP31:%.*]] = sext i16 [[_MSLD30]] to i32
-// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP116]] to i32
-// CHECK-NEXT:    [[TMP120:%.*]] = load i32, ptr [[SUM]], align 4
-// CHECK-NEXT:    [[TMP121:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP122:%.*]] = xor i64 [[TMP121]], 193514046488576
-// CHECK-NEXT:    [[TMP123:%.*]] = inttoptr i64 [[TMP122]] to ptr
-// CHECK-NEXT:    [[_MSLD32:%.*]] = load i32, ptr [[TMP123]], align 4
-// CHECK-NEXT:    [[_MSPROP33:%.*]] = or i32 [[_MSLD32]], [[_MSPROP31]]
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP120]], [[CONV]]
-// CHECK-NEXT:    [[TMP124:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP125:%.*]] = xor i64 [[TMP124]], 193514046488576
-// CHECK-NEXT:    [[TMP126:%.*]] = inttoptr i64 [[TMP125]] to ptr
-// CHECK-NEXT:    store i32 [[_MSPROP33]], ptr [[TMP126]], align 4
-// CHECK-NEXT:    store i32 [[ADD]], ptr [[SUM]], align 4
-// CHECK-NEXT:    br label [[FOR_INC:%.*]]
-// CHECK:       for.inc:
-// CHECK-NEXT:    [[TMP127:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP128:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP129:%.*]] = xor i64 [[TMP128]], 193514046488576
-// CHECK-NEXT:    [[TMP130:%.*]] = inttoptr i64 [[TMP129]] to ptr
-// CHECK-NEXT:    [[_MSLD34:%.*]] = load i32, ptr [[TMP130]], align 4
-// CHECK-NEXT:    [[_MSPROP35:%.*]] = or i32 [[_MSLD34]], 0
-// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP127]], 1
-// CHECK-NEXT:    [[TMP131:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP132:%.*]] = xor i64 [[TMP131]], 193514046488576
-// CHECK-NEXT:    [[TMP133:%.*]] = inttoptr i64 [[TMP132]] to ptr
-// CHECK-NEXT:    store i32 [[_MSPROP35]], ptr [[TMP133]], align 4
-// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
-// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
-// CHECK:       for.end:
-// CHECK-NEXT:    [[TMP134:%.*]] = load i32, ptr [[SUM]], align 4
-// CHECK-NEXT:    [[TMP135:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP136:%.*]] = xor i64 [[TMP135]], 193514046488576
-// CHECK-NEXT:    [[TMP137:%.*]] = inttoptr i64 [[TMP136]] to ptr
-// CHECK-NEXT:    [[_MSLD36:%.*]] = load i32, ptr [[TMP137]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[DST2]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[VEC2]]) #[[ATTR4]]
-// CHECK-NEXT:    [[_MSCMP39:%.*]] = icmp ne i32 [[_MSLD36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP39]], label [[TMP138:%.*]], label [[TMP139:%.*]], !prof [[PROF2]]
-// CHECK:       138:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       139:
-// CHECK-NEXT:    ret i32 [[TMP134]]
-//
-int test_vst2(void) {
-  int16x8x2_t vec2;
-  vec2.val[1] = vdupq_n_s16(16);
-  int16_t dst2[8*2];
-  vst2q_s16(dst2, vec2);
-
-  __msan_print_shadow(dst2, sizeof(int16_t)*8*2);
-
-  int sum = 0;
-  for (int i = 0; i < 8*2; i++)
-    sum += dst2[i];
-
-  return sum;
-}
-
-// CHECK-LABEL: define dso_local noundef i32 @test_vst3(
-// CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[__P0_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP2]], i8 -1, i64 2, i1 false)
-// CHECK-NEXT:    [[__RET_I:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[VEC3:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
-// CHECK-NEXT:    [[DST3:%.*]] = alloca [24 x i16], align 2
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
-// CHECK-NEXT:    [[SUM:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[VEC3]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[VEC3]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP8]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store i16 0, ptr [[TMP11]], align 2
-// CHECK-NEXT:    store i16 17, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP18]], align 2
-// CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, i16 [[_MSLD]], i32 0
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP15]], i32 0
-// CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP20]], 193514046488576
-// CHECK-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load i16, ptr [[TMP22]], align 2
-// CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD9]], i32 1
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[TMP19]], i32 1
-// CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP24:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP24]], 193514046488576
-// CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
-// CHECK-NEXT:    [[_MSLD11:%.*]] = load i16, ptr [[TMP26]], align 2
-// CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <8 x i16> [[_MSPROP10]], i16 [[_MSLD11]], i32 2
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[TMP23]], i32 2
-// CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD13:%.*]] = load i16, ptr [[TMP30]], align 2
-// CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <8 x i16> [[_MSPROP12]], i16 [[_MSLD13]], i32 3
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[TMP27]], i32 3
-// CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    [[_MSLD15:%.*]] = load i16, ptr [[TMP34]], align 2
-// CHECK-NEXT:    [[_MSPROP16:%.*]] = insertelement <8 x i16> [[_MSPROP14]], i16 [[_MSLD15]], i32 4
-// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[TMP31]], i32 4
-// CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    [[_MSLD17:%.*]] = load i16, ptr [[TMP38]], align 2
-// CHECK-NEXT:    [[_MSPROP18:%.*]] = insertelement <8 x i16> [[_MSPROP16]], i16 [[_MSLD17]], i32 5
-// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[TMP35]], i32 5
-// CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD19:%.*]] = load i16, ptr [[TMP42]], align 2
-// CHECK-NEXT:    [[_MSPROP20:%.*]] = insertelement <8 x i16> [[_MSPROP18]], i16 [[_MSLD19]], i32 6
-// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[TMP39]], i32 6
-// CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP44:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP45:%.*]] = xor i64 [[TMP44]], 193514046488576
-// CHECK-NEXT:    [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr
-// CHECK-NEXT:    [[_MSLD21:%.*]] = load i16, ptr [[TMP46]], align 2
-// CHECK-NEXT:    [[_MSPROP22:%.*]] = insertelement <8 x i16> [[_MSPROP20]], i16 [[_MSLD21]], i32 7
-// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[TMP43]], i32 7
-// CHECK-NEXT:    [[TMP47:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP48:%.*]] = xor i64 [[TMP47]], 193514046488576
-// CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSPROP22]], ptr [[TMP49]], align 16
-// CHECK-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
-// CHECK-NEXT:    [[TMP50:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
-// CHECK-NEXT:    [[TMP51:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP52:%.*]] = xor i64 [[TMP51]], 193514046488576
-// CHECK-NEXT:    [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
-// CHECK-NEXT:    [[_MSLD23:%.*]] = load <8 x i16>, ptr [[TMP53]], align 16
-// CHECK-NEXT:    [[TMP54:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP55:%.*]] = xor i64 [[TMP54]], 193514046488576
-// CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD23]], ptr [[TMP56]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP50]], ptr [[__RET_I]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = load <8 x i16>, ptr [[__RET_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP59:%.*]] = xor i64 [[TMP58]], 193514046488576
-// CHECK-NEXT:    [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
-// CHECK-NEXT:    [[_MSLD24:%.*]] = load <8 x i16>, ptr [[TMP60]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[VEC3]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP61:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP62:%.*]] = xor i64 [[TMP61]], 193514046488576
-// CHECK-NEXT:    [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD24]], ptr [[TMP63]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP57]], ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[DST3]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[DST3]] to i64
-// CHECK-NEXT:    [[TMP65:%.*]] = xor i64 [[TMP64]], 193514046488576
-// CHECK-NEXT:    [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP66]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP67:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP68:%.*]] = xor i64 [[TMP67]], 193514046488576
-// CHECK-NEXT:    [[TMP69:%.*]] = inttoptr i64 [[TMP68]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP69]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP70:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[VEC3]], i64 48)
-// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [24 x i16], ptr [[DST3]], i64 0, i64 0
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP71:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP72:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP73:%.*]] = xor i64 [[TMP72]], 193514046488576
-// CHECK-NEXT:    [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr
-// CHECK-NEXT:    [[_MSLD25:%.*]] = load <8 x i16>, ptr [[TMP74]], align 16
-// CHECK-NEXT:    [[TMP75:%.*]] = bitcast <8 x i16> [[_MSLD25]] to <16 x i8>
-// CHECK-NEXT:    [[TMP76:%.*]] = bitcast <8 x i16> [[TMP71]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP77:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP78:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP79:%.*]] = xor i64 [[TMP78]], 193514046488576
-// CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
-// CHECK-NEXT:    [[_MSLD26:%.*]] = load <8 x i16>, ptr [[TMP80]], align 16
-// CHECK-NEXT:    [[TMP81:%.*]] = bitcast <8 x i16> [[_MSLD26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP82:%.*]] = bitcast <8 x i16> [[TMP77]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL5]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP83:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP84:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP85:%.*]] = xor i64 [[TMP84]], 193514046488576
-// CHECK-NEXT:    [[TMP86:%.*]] = inttoptr i64 [[TMP85]] to ptr
-// CHECK-NEXT:    [[_MSLD27:%.*]] = load <8 x i16>, ptr [[TMP86]], align 16
-// CHECK-NEXT:    [[TMP87:%.*]] = bitcast <8 x i16> [[_MSLD27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP88:%.*]] = bitcast <8 x i16> [[TMP83]] to <16 x i8>
-// CHECK-NEXT:    [[TMP89:%.*]] = bitcast <16 x i8> [[TMP75]] to <8 x i16>
-// CHECK-NEXT:    [[TMP90:%.*]] = bitcast <16 x i8> [[TMP76]] to <8 x i16>
-// CHECK-NEXT:    [[TMP91:%.*]] = bitcast <16 x i8> [[TMP81]] to <8 x i16>
-// CHECK-NEXT:    [[TMP92:%.*]] = bitcast <16 x i8> [[TMP82]] to <8 x i16>
-// CHECK-NEXT:    [[TMP93:%.*]] = bitcast <16 x i8> [[TMP87]] to <8 x i16>
-// CHECK-NEXT:    [[TMP94:%.*]] = bitcast <16 x i8> [[TMP88]] to <8 x i16>
-// CHECK-NEXT:    [[TMP95:%.*]] = bitcast <8 x i16> [[TMP89]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP95]], 0
-// CHECK-NEXT:    [[TMP96:%.*]] = bitcast <8 x i16> [[TMP91]] to i128
-// CHECK-NEXT:    [[_MSCMP40:%.*]] = icmp ne i128 [[TMP96]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP40]]
-// CHECK-NEXT:    [[TMP97:%.*]] = bitcast <8 x i16> [[TMP93]] to i128
-// CHECK-NEXT:    [[_MSCMP41:%.*]] = icmp ne i128 [[TMP97]], 0
-// CHECK-NEXT:    [[_MSOR42:%.*]] = or i1 [[_MSOR]], [[_MSCMP41]]
-// CHECK-NEXT:    br i1 [[_MSOR42]], label [[TMP98:%.*]], label [[TMP99:%.*]], !prof [[PROF2]]
-// CHECK:       98:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       99:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP90]], <8 x i16> [[TMP92]], <8 x i16> [[TMP94]], ptr [[ARRAYDECAY]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[ARRAYDECAY7:%.*]] = getelementptr inbounds [24 x i16], ptr [[DST3]], i64 0, i64 0
-// CHECK-NEXT:    call void @__msan_print_shadow(ptr noundef [[ARRAYDECAY7]], i64 noundef 48)
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP100:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP101:%.*]] = xor i64 [[TMP100]], 193514046488576
-// CHECK-NEXT:    [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP102]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[TMP103:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP104:%.*]] = xor i64 [[TMP103]], 193514046488576
-// CHECK-NEXT:    [[TMP105:%.*]] = inttoptr i64 [[TMP104]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP105]], align 4
-// CHECK-NEXT:    store i32 0, ptr [[SUM]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP106:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP107:%.*]] = xor i64 [[TMP106]], 193514046488576
-// CHECK-NEXT:    [[TMP108:%.*]] = inttoptr i64 [[TMP107]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP108]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[TMP109:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP110:%.*]] = xor i64 [[TMP109]], 193514046488576
-// CHECK-NEXT:    [[TMP111:%.*]] = inttoptr i64 [[TMP110]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP111]], align 4
-// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP112:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP113:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP114:%.*]] = xor i64 [[TMP113]], 193514046488576
-// CHECK-NEXT:    [[TMP115:%.*]] = inttoptr i64 [[TMP114]] to ptr
-// CHECK-NEXT:    [[_MSLD28:%.*]] = load i32, ptr [[TMP115]], align 4
-// CHECK-NEXT:    [[_MSPROP29:%.*]] = or i32 [[_MSLD28]], 0
-// CHECK-NEXT:    [[TMP116:%.*]] = icmp ne i32 [[_MSPROP29]], 0
-// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP112]], 24
-// CHECK-NEXT:    br i1 [[TMP116]], label [[TMP117:%.*]], label [[TMP118:%.*]], !prof [[PROF2]]
-// CHECK:       117:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       118:
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    br label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    [[TMP119:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP120:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP121:%.*]] = xor i64 [[TMP120]], 193514046488576
-// CHECK-NEXT:    [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr
-// CHECK-NEXT:    [[_MSLD30:%.*]] = load i32, ptr [[TMP122]], align 4
-// CHECK-NEXT:    [[_MSPROP31:%.*]] = sext i32 [[_MSLD30]] to i64
-// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP119]] to i64
-// CHECK-NEXT:    [[_MSPROP32:%.*]] = or i64 0, [[_MSPROP31]]
-// CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [24 x i16], ptr [[DST3]], i64 0, i64 [[IDXPROM]]
-// CHECK-NEXT:    [[_MSCMP43:%.*]] = icmp ne i64 [[_MSPROP32]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP43]], label [[TMP123:%.*]], label [[TMP124:%.*]], !prof [[PROF2]]
-// CHECK:       123:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       124:
-// CHECK-NEXT:    [[TMP125:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2
-// CHECK-NEXT:    [[TMP126:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64
-// CHECK-NEXT:    [[TMP127:%.*]] = xor i64 [[TMP126]], 193514046488576
-// CHECK-NEXT:    [[TMP128:%.*]] = inttoptr i64 [[TMP127]] to ptr
-// CHECK-NEXT:    [[_MSLD33:%.*]] = load i16, ptr [[TMP128]], align 2
-// CHECK-NEXT:    [[_MSPROP34:%.*]] = sext i16 [[_MSLD33]] to i32
-// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP125]] to i32
-// CHECK-NEXT:    [[TMP129:%.*]] = load i32, ptr [[SUM]], align 4
-// CHECK-NEXT:    [[TMP130:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP131:%.*]] = xor i64 [[TMP130]], 193514046488576
-// CHECK-NEXT:    [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr
-// CHECK-NEXT:    [[_MSLD35:%.*]] = load i32, ptr [[TMP132]], align 4
-// CHECK-NEXT:    [[_MSPROP36:%.*]] = or i32 [[_MSLD35]], [[_MSPROP34]]
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP129]], [[CONV]]
-// CHECK-NEXT:    [[TMP133:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP134:%.*]] = xor i64 [[TMP133]], 193514046488576
-// CHECK-NEXT:    [[TMP135:%.*]] = inttoptr i64 [[TMP134]] to ptr
-// CHECK-NEXT:    store i32 [[_MSPROP36]], ptr [[TMP135]], align 4
-// CHECK-NEXT:    store i32 [[ADD]], ptr [[SUM]], align 4
-// CHECK-NEXT:    br label [[FOR_INC:%.*]]
-// CHECK:       for.inc:
-// CHECK-NEXT:    [[TMP136:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP137:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP138:%.*]] = xor i64 [[TMP137]], 193514046488576
-// CHECK-NEXT:    [[TMP139:%.*]] = inttoptr i64 [[TMP138]] to ptr
-// CHECK-NEXT:    [[_MSLD37:%.*]] = load i32, ptr [[TMP139]], align 4
-// CHECK-NEXT:    [[_MSPROP38:%.*]] = or i32 [[_MSLD37]], 0
-// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP136]], 1
-// CHECK-NEXT:    [[TMP140:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP141:%.*]] = xor i64 [[TMP140]], 193514046488576
-// CHECK-NEXT:    [[TMP142:%.*]] = inttoptr i64 [[TMP141]] to ptr
-// CHECK-NEXT:    store i32 [[_MSPROP38]], ptr [[TMP142]], align 4
-// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
-// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
-// CHECK:       for.end:
-// CHECK-NEXT:    [[TMP143:%.*]] = load i32, ptr [[SUM]], align 4
-// CHECK-NEXT:    [[TMP144:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP145:%.*]] = xor i64 [[TMP144]], 193514046488576
-// CHECK-NEXT:    [[TMP146:%.*]] = inttoptr i64 [[TMP145]] to ptr
-// CHECK-NEXT:    [[_MSLD39:%.*]] = load i32, ptr [[TMP146]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[DST3]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[VEC3]]) #[[ATTR4]]
-// CHECK-NEXT:    [[_MSCMP44:%.*]] = icmp ne i32 [[_MSLD39]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP44]], label [[TMP147:%.*]], label [[TMP148:%.*]], !prof [[PROF2]]
-// CHECK:       147:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       148:
-// CHECK-NEXT:    ret i32 [[TMP143]]
-//
-int test_vst3(void) {
-  int16x8x3_t vec3;
-  vec3.val[1] = vdupq_n_s16(17);
-  int16_t dst3[8*3];
-  vst3q_s16(dst3, vec3);
-
-  __msan_print_shadow(dst3, sizeof(int16_t)*8*3);
-
-  int sum = 0;
-  for (int i = 0; i < 8*3; i++)
-    sum += dst3[i];
-
-  return sum;
-}
-
-// CHECK-LABEL: define dso_local noundef i32 @test_vst4(
-// CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[__P0_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP2]], i8 -1, i64 2, i1 false)
-// CHECK-NEXT:    [[__RET_I:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[VEC4:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
-// CHECK-NEXT:    [[DST4:%.*]] = alloca [32 x i16], align 2
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
-// CHECK-NEXT:    [[SUM:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[VEC4]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[VEC4]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP8]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store i16 0, ptr [[TMP11]], align 2
-// CHECK-NEXT:    store i16 18, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP18]], align 2
-// CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, i16 [[_MSLD]], i32 0
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP15]], i32 0
-// CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP20]], 193514046488576
-// CHECK-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-// CHECK-NEXT:    [[_MSLD11:%.*]] = load i16, ptr [[TMP22]], align 2
-// CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD11]], i32 1
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[TMP19]], i32 1
-// CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP24:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP24]], 193514046488576
-// CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
-// CHECK-NEXT:    [[_MSLD13:%.*]] = load i16, ptr [[TMP26]], align 2
-// CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <8 x i16> [[_MSPROP12]], i16 [[_MSLD13]], i32 2
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[TMP23]], i32 2
-// CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD15:%.*]] = load i16, ptr [[TMP30]], align 2
-// CHECK-NEXT:    [[_MSPROP16:%.*]] = insertelement <8 x i16> [[_MSPROP14]], i16 [[_MSLD15]], i32 3
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[TMP27]], i32 3
-// CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    [[_MSLD17:%.*]] = load i16, ptr [[TMP34]], align 2
-// CHECK-NEXT:    [[_MSPROP18:%.*]] = insertelement <8 x i16> [[_MSPROP16]], i16 [[_MSLD17]], i32 4
-// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[TMP31]], i32 4
-// CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    [[_MSLD19:%.*]] = load i16, ptr [[TMP38]], align 2
-// CHECK-NEXT:    [[_MSPROP20:%.*]] = insertelement <8 x i16> [[_MSPROP18]], i16 [[_MSLD19]], i32 5
-// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[TMP35]], i32 5
-// CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD21:%.*]] = load i16, ptr [[TMP42]], align 2
-// CHECK-NEXT:    [[_MSPROP22:%.*]] = insertelement <8 x i16> [[_MSPROP20]], i16 [[_MSLD21]], i32 6
-// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[TMP39]], i32 6
-// CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[__P0_ADDR_I]], align 2
-// CHECK-NEXT:    [[TMP44:%.*]] = ptrtoint ptr [[__P0_ADDR_I]] to i64
-// CHECK-NEXT:    [[TMP45:%.*]] = xor i64 [[TMP44]], 193514046488576
-// CHECK-NEXT:    [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr
-// CHECK-NEXT:    [[_MSLD23:%.*]] = load i16, ptr [[TMP46]], align 2
-// CHECK-NEXT:    [[_MSPROP24:%.*]] = insertelement <8 x i16> [[_MSPROP22]], i16 [[_MSLD23]], i32 7
-// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[TMP43]], i32 7
-// CHECK-NEXT:    [[TMP47:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP48:%.*]] = xor i64 [[TMP47]], 193514046488576
-// CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSPROP24]], ptr [[TMP49]], align 16
-// CHECK-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
-// CHECK-NEXT:    [[TMP50:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
-// CHECK-NEXT:    [[TMP51:%.*]] = ptrtoint ptr [[DOTCOMPOUNDLITERAL_I]] to i64
-// CHECK-NEXT:    [[TMP52:%.*]] = xor i64 [[TMP51]], 193514046488576
-// CHECK-NEXT:    [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
-// CHECK-NEXT:    [[_MSLD25:%.*]] = load <8 x i16>, ptr [[TMP53]], align 16
-// CHECK-NEXT:    [[TMP54:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP55:%.*]] = xor i64 [[TMP54]], 193514046488576
-// CHECK-NEXT:    [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD25]], ptr [[TMP56]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP50]], ptr [[__RET_I]], align 16
-// CHECK-NEXT:    [[TMP57:%.*]] = load <8 x i16>, ptr [[__RET_I]], align 16
-// CHECK-NEXT:    [[TMP58:%.*]] = ptrtoint ptr [[__RET_I]] to i64
-// CHECK-NEXT:    [[TMP59:%.*]] = xor i64 [[TMP58]], 193514046488576
-// CHECK-NEXT:    [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
-// CHECK-NEXT:    [[_MSLD26:%.*]] = load <8 x i16>, ptr [[TMP60]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET_I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[VEC4]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP61:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP62:%.*]] = xor i64 [[TMP61]], 193514046488576
-// CHECK-NEXT:    [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD26]], ptr [[TMP63]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP57]], ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[DST4]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP64:%.*]] = ptrtoint ptr [[DST4]] to i64
-// CHECK-NEXT:    [[TMP65:%.*]] = xor i64 [[TMP64]], 193514046488576
-// CHECK-NEXT:    [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 2 [[TMP66]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP67:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP68:%.*]] = xor i64 [[TMP67]], 193514046488576
-// CHECK-NEXT:    [[TMP69:%.*]] = inttoptr i64 [[TMP68]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP69]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP70:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[VEC4]], i64 64)
-// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [32 x i16], ptr [[DST4]], i64 0, i64 0
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP71:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP72:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP73:%.*]] = xor i64 [[TMP72]], 193514046488576
-// CHECK-NEXT:    [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr
-// CHECK-NEXT:    [[_MSLD27:%.*]] = load <8 x i16>, ptr [[TMP74]], align 16
-// CHECK-NEXT:    [[TMP75:%.*]] = bitcast <8 x i16> [[_MSLD27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP76:%.*]] = bitcast <8 x i16> [[TMP71]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP77:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP78:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP79:%.*]] = xor i64 [[TMP78]], 193514046488576
-// CHECK-NEXT:    [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
-// CHECK-NEXT:    [[_MSLD28:%.*]] = load <8 x i16>, ptr [[TMP80]], align 16
-// CHECK-NEXT:    [[TMP81:%.*]] = bitcast <8 x i16> [[_MSLD28]] to <16 x i8>
-// CHECK-NEXT:    [[TMP82:%.*]] = bitcast <8 x i16> [[TMP77]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP83:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP84:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP85:%.*]] = xor i64 [[TMP84]], 193514046488576
-// CHECK-NEXT:    [[TMP86:%.*]] = inttoptr i64 [[TMP85]] to ptr
-// CHECK-NEXT:    [[_MSLD29:%.*]] = load <8 x i16>, ptr [[TMP86]], align 16
-// CHECK-NEXT:    [[TMP87:%.*]] = bitcast <8 x i16> [[_MSLD29]] to <16 x i8>
-// CHECK-NEXT:    [[TMP88:%.*]] = bitcast <8 x i16> [[TMP83]] to <16 x i8>
-// CHECK-NEXT:    [[VAL7:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL7]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP89:%.*]] = load <8 x i16>, ptr [[ARRAYIDX8]], align 16
-// CHECK-NEXT:    [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64
-// CHECK-NEXT:    [[TMP91:%.*]] = xor i64 [[TMP90]], 193514046488576
-// CHECK-NEXT:    [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr
-// CHECK-NEXT:    [[_MSLD30:%.*]] = load <8 x i16>, ptr [[TMP92]], align 16
-// CHECK-NEXT:    [[TMP93:%.*]] = bitcast <8 x i16> [[_MSLD30]] to <16 x i8>
-// CHECK-NEXT:    [[TMP94:%.*]] = bitcast <8 x i16> [[TMP89]] to <16 x i8>
-// CHECK-NEXT:    [[TMP95:%.*]] = bitcast <16 x i8> [[TMP75]] to <8 x i16>
-// CHECK-NEXT:    [[TMP96:%.*]] = bitcast <16 x i8> [[TMP76]] to <8 x i16>
-// CHECK-NEXT:    [[TMP97:%.*]] = bitcast <16 x i8> [[TMP81]] to <8 x i16>
-// CHECK-NEXT:    [[TMP98:%.*]] = bitcast <16 x i8> [[TMP82]] to <8 x i16>
-// CHECK-NEXT:    [[TMP99:%.*]] = bitcast <16 x i8> [[TMP87]] to <8 x i16>
-// CHECK-NEXT:    [[TMP100:%.*]] = bitcast <16 x i8> [[TMP88]] to <8 x i16>
-// CHECK-NEXT:    [[TMP101:%.*]] = bitcast <16 x i8> [[TMP93]] to <8 x i16>
-// CHECK-NEXT:    [[TMP102:%.*]] = bitcast <16 x i8> [[TMP94]] to <8 x i16>
-// CHECK-NEXT:    [[TMP103:%.*]] = bitcast <8 x i16> [[TMP95]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP103]], 0
-// CHECK-NEXT:    [[TMP104:%.*]] = bitcast <8 x i16> [[TMP97]] to i128
-// CHECK-NEXT:    [[_MSCMP43:%.*]] = icmp ne i128 [[TMP104]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP43]]
-// CHECK-NEXT:    [[TMP105:%.*]] = bitcast <8 x i16> [[TMP99]] to i128
-// CHECK-NEXT:    [[_MSCMP44:%.*]] = icmp ne i128 [[TMP105]], 0
-// CHECK-NEXT:    [[_MSOR45:%.*]] = or i1 [[_MSOR]], [[_MSCMP44]]
-// CHECK-NEXT:    [[TMP106:%.*]] = bitcast <8 x i16> [[TMP101]] to i128
-// CHECK-NEXT:    [[_MSCMP46:%.*]] = icmp ne i128 [[TMP106]], 0
-// CHECK-NEXT:    [[_MSOR47:%.*]] = or i1 [[_MSOR45]], [[_MSCMP46]]
-// CHECK-NEXT:    br i1 [[_MSOR47]], label [[TMP107:%.*]], label [[TMP108:%.*]], !prof [[PROF2]]
-// CHECK:       107:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       108:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP96]], <8 x i16> [[TMP98]], <8 x i16> [[TMP100]], <8 x i16> [[TMP102]], ptr [[ARRAYDECAY]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[ARRAYDECAY9:%.*]] = getelementptr inbounds [32 x i16], ptr [[DST4]], i64 0, i64 0
-// CHECK-NEXT:    call void @__msan_print_shadow(ptr noundef [[ARRAYDECAY9]], i64 noundef 64)
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP109:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP110:%.*]] = xor i64 [[TMP109]], 193514046488576
-// CHECK-NEXT:    [[TMP111:%.*]] = inttoptr i64 [[TMP110]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP111]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[TMP112:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP113:%.*]] = xor i64 [[TMP112]], 193514046488576
-// CHECK-NEXT:    [[TMP114:%.*]] = inttoptr i64 [[TMP113]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP114]], align 4
-// CHECK-NEXT:    store i32 0, ptr [[SUM]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP115:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP116:%.*]] = xor i64 [[TMP115]], 193514046488576
-// CHECK-NEXT:    [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP117]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[TMP118:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP119:%.*]] = xor i64 [[TMP118]], 193514046488576
-// CHECK-NEXT:    [[TMP120:%.*]] = inttoptr i64 [[TMP119]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP120]], align 4
-// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP121:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP122:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP123:%.*]] = xor i64 [[TMP122]], 193514046488576
-// CHECK-NEXT:    [[TMP124:%.*]] = inttoptr i64 [[TMP123]] to ptr
-// CHECK-NEXT:    [[_MSLD31:%.*]] = load i32, ptr [[TMP124]], align 4
-// CHECK-NEXT:    [[_MSPROP32:%.*]] = or i32 [[_MSLD31]], 0
-// CHECK-NEXT:    [[TMP125:%.*]] = icmp ne i32 [[_MSPROP32]], 0
-// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP121]], 32
-// CHECK-NEXT:    br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF2]]
-// CHECK:       126:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       127:
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-// CHECK:       for.cond.cleanup:
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK-NEXT:    br label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    [[TMP128:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP129:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP130:%.*]] = xor i64 [[TMP129]], 193514046488576
-// CHECK-NEXT:    [[TMP131:%.*]] = inttoptr i64 [[TMP130]] to ptr
-// CHECK-NEXT:    [[_MSLD33:%.*]] = load i32, ptr [[TMP131]], align 4
-// CHECK-NEXT:    [[_MSPROP34:%.*]] = sext i32 [[_MSLD33]] to i64
-// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP128]] to i64
-// CHECK-NEXT:    [[_MSPROP35:%.*]] = or i64 0, [[_MSPROP34]]
-// CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [32 x i16], ptr [[DST4]], i64 0, i64 [[IDXPROM]]
-// CHECK-NEXT:    [[_MSCMP48:%.*]] = icmp ne i64 [[_MSPROP35]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP48]], label [[TMP132:%.*]], label [[TMP133:%.*]], !prof [[PROF2]]
-// CHECK:       132:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       133:
-// CHECK-NEXT:    [[TMP134:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
-// CHECK-NEXT:    [[TMP135:%.*]] = ptrtoint ptr [[ARRAYIDX10]] to i64
-// CHECK-NEXT:    [[TMP136:%.*]] = xor i64 [[TMP135]], 193514046488576
-// CHECK-NEXT:    [[TMP137:%.*]] = inttoptr i64 [[TMP136]] to ptr
-// CHECK-NEXT:    [[_MSLD36:%.*]] = load i16, ptr [[TMP137]], align 2
-// CHECK-NEXT:    [[_MSPROP37:%.*]] = sext i16 [[_MSLD36]] to i32
-// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP134]] to i32
-// CHECK-NEXT:    [[TMP138:%.*]] = load i32, ptr [[SUM]], align 4
-// CHECK-NEXT:    [[TMP139:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP140:%.*]] = xor i64 [[TMP139]], 193514046488576
-// CHECK-NEXT:    [[TMP141:%.*]] = inttoptr i64 [[TMP140]] to ptr
-// CHECK-NEXT:    [[_MSLD38:%.*]] = load i32, ptr [[TMP141]], align 4
-// CHECK-NEXT:    [[_MSPROP39:%.*]] = or i32 [[_MSLD38]], [[_MSPROP37]]
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP138]], [[CONV]]
-// CHECK-NEXT:    [[TMP142:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP143:%.*]] = xor i64 [[TMP142]], 193514046488576
-// CHECK-NEXT:    [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr
-// CHECK-NEXT:    store i32 [[_MSPROP39]], ptr [[TMP144]], align 4
-// CHECK-NEXT:    store i32 [[ADD]], ptr [[SUM]], align 4
-// CHECK-NEXT:    br label [[FOR_INC:%.*]]
-// CHECK:       for.inc:
-// CHECK-NEXT:    [[TMP145:%.*]] = load i32, ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP146:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP147:%.*]] = xor i64 [[TMP146]], 193514046488576
-// CHECK-NEXT:    [[TMP148:%.*]] = inttoptr i64 [[TMP147]] to ptr
-// CHECK-NEXT:    [[_MSLD40:%.*]] = load i32, ptr [[TMP148]], align 4
-// CHECK-NEXT:    [[_MSPROP41:%.*]] = or i32 [[_MSLD40]], 0
-// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP145]], 1
-// CHECK-NEXT:    [[TMP149:%.*]] = ptrtoint ptr [[I]] to i64
-// CHECK-NEXT:    [[TMP150:%.*]] = xor i64 [[TMP149]], 193514046488576
-// CHECK-NEXT:    [[TMP151:%.*]] = inttoptr i64 [[TMP150]] to ptr
-// CHECK-NEXT:    store i32 [[_MSPROP41]], ptr [[TMP151]], align 4
-// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
-// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
-// CHECK:       for.end:
-// CHECK-NEXT:    [[TMP152:%.*]] = load i32, ptr [[SUM]], align 4
-// CHECK-NEXT:    [[TMP153:%.*]] = ptrtoint ptr [[SUM]] to i64
-// CHECK-NEXT:    [[TMP154:%.*]] = xor i64 [[TMP153]], 193514046488576
-// CHECK-NEXT:    [[TMP155:%.*]] = inttoptr i64 [[TMP154]] to ptr
-// CHECK-NEXT:    [[_MSLD42:%.*]] = load i32, ptr [[TMP155]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[SUM]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[DST4]]) #[[ATTR4]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[VEC4]]) #[[ATTR4]]
-// CHECK-NEXT:    [[_MSCMP49:%.*]] = icmp ne i32 [[_MSLD42]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP49]], label [[TMP156:%.*]], label [[TMP157:%.*]], !prof [[PROF2]]
-// CHECK:       156:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       157:
-// CHECK-NEXT:    ret i32 [[TMP152]]
-//
-int test_vst4(void) {
-  int16x8x4_t vec4;
-  vec4.val[2] = vdupq_n_s16(18);
-  int16_t dst4[8*4];
-  vst4q_s16(dst4, vec4);
-
-  __msan_print_shadow(dst4, sizeof(int16_t)*8*4);
-
-  int sum = 0;
-  for (int i = 0; i < 8*4; i++)
-    sum += dst4[i];
-
-  return sum;
-}
-
-// CHECK-LABEL: define dso_local noundef i32 @main(
-// CHECK-SAME: i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP2]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ARGC_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP5]], i8 -1, i64 4, i1 false)
-// CHECK-NEXT:    [[ARGV_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ARGV_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP8]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP11]], align 4
-// CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[ARGC_ADDR]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    store i32 0, ptr [[TMP14]], align 4
-// CHECK-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
-// CHECK-NEXT:    [[TMP15:%.*]] = ptrtoint ptr [[ARGV_ADDR]] to i64
-// CHECK-NEXT:    [[TMP16:%.*]] = xor i64 [[TMP15]], 193514046488576
-// CHECK-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP17]], align 8
-// CHECK-NEXT:    store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8
-// CHECK-NEXT:    [[CALL:%.*]] = call noundef i32 @test_vst1()
-// CHECK-NEXT:    [[CALL1:%.*]] = call noundef i32 @test_vst2()
-// CHECK-NEXT:    [[CALL2:%.*]] = call noundef i32 @test_vst3()
-// CHECK-NEXT:    [[CALL3:%.*]] = call noundef i32 @test_vst4()
-// CHECK-NEXT:    ret i32 0
-//
-int main (int argc, char* argv[]) {
-    test_vst1();
-    test_vst2();
-    test_vst3();
-    test_vst4();
-
-    return 0;
-}
-//.
-// CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
-// CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
-// CHECK: [[META4]] = !{!"llvm.loop.mustprogress"}
-// CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
-// CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
-// CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
-//.
diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics-msan.c b/clang/test/CodeGen/aarch64-neon-intrinsics-msan.c
deleted file mode 100644
index 5f042b10a0c8e..0000000000000
--- a/clang/test/CodeGen/aarch64-neon-intrinsics-msan.c
+++ /dev/null
@@ -1,18071 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:     -S \
-// RUN:  -flax-vector-conversions=none -emit-llvm -o - %s -fsanitize=memory \
-// RUN: | FileCheck %s
-
-// REQUIRES: aarch64-registered-target || arm-registered-target
-
-// Forked from aarch64-neon-intrinsics.c
-
-#include <arm_neon.h>
-
-// CHECK-LABEL: define dso_local noundef <16 x i8> @test_vld1q_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4:[0-9]+]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2:![0-9]+]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <16 x i8> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <16 x i8> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <16 x i8> [[TMP32]]
-//
-uint8x16_t test_vld1q_u8(uint8_t const *a) {
-  return vld1q_u8(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x i16> @test_vld1q_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i16>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i16> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x i16> [[TMP32]]
-//
-uint16x8_t test_vld1q_u16(uint16_t const *a) {
-  return vld1q_u16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x i32> @test_vld1q_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <4 x i32> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i32>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <4 x i32> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x i32> [[TMP32]]
-//
-uint32x4_t test_vld1q_u32(uint32_t const *a) {
-  return vld1q_u32(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <2 x i64> @test_vld1q_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP12]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i64>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i64>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i64> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <2 x i64> [[TMP32]]
-//
-uint64x2_t test_vld1q_u64(uint64_t const *a) {
-  return vld1q_u64(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <16 x i8> @test_vld1q_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <16 x i8> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <16 x i8> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <16 x i8> [[TMP32]]
-//
-int8x16_t test_vld1q_s8(int8_t const *a) {
-  return vld1q_s8(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x i16> @test_vld1q_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i16>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i16> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x i16> [[TMP32]]
-//
-int16x8_t test_vld1q_s16(int16_t const *a) {
-  return vld1q_s16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x i32> @test_vld1q_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <4 x i32> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i32>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <4 x i32> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x i32> [[TMP32]]
-//
-int32x4_t test_vld1q_s32(int32_t const *a) {
-  return vld1q_s32(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <2 x i64> @test_vld1q_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i64>, ptr [[TMP12]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i64>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i64>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i64> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <2 x i64> [[TMP32]]
-//
-int64x2_t test_vld1q_s64(int64_t const *a) {
-  return vld1q_s64(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x half> @test_vld1q_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x half>, ptr [[TMP12]], align 2
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <8 x half> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x half>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <8 x half> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x half>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i16> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x half> [[TMP32]]
-//
-float16x8_t test_vld1q_f16(float16_t const *a) {
-  return vld1q_f16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x float> @test_vld1q_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x float>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x float>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i32> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x float> [[TMP32]]
-//
-float32x4_t test_vld1q_f32(float32_t const *a) {
-  return vld1q_f32(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <2 x double> @test_vld1q_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x double>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x double>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i64> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <2 x double> [[TMP32]]
-//
-float64x2_t test_vld1q_f64(float64_t const *a) {
-  return vld1q_f64(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <16 x i8> @test_vld1q_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <16 x i8> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <16 x i8> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <16 x i8> [[TMP32]]
-//
-poly8x16_t test_vld1q_p8(poly8_t const *a) {
-  return vld1q_p8(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x i16> @test_vld1q_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD1]], ptr [[TMP24]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP18]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i16>, ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD2]], ptr [[TMP31]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP25]], ptr [[TMP]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i16>, ptr [[TMP]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP35]], align 16
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i16> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x i16> [[TMP32]]
-//
-poly16x8_t test_vld1q_p16(poly16_t const *a) {
-  return vld1q_p16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
-//
-uint8x8_t test_vld1_u8(uint8_t const *a) {
-  return vld1_u8(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 2
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
-//
-uint16x4_t test_vld1_u16(uint16_t const *a) {
-  return vld1_u16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <2 x i32> @test_vld1_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[TMP12]], align 4
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 4
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i32>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <2 x i32> [[TMP32]]
-//
-uint32x2_t test_vld1_u32(uint32_t const *a) {
-  return vld1_u32(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <1 x i64> @test_vld1_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x i64>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x i64>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <1 x i64> [[TMP32]]
-//
-uint64x1_t test_vld1_u64(uint64_t const *a) {
-  return vld1_u64(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
-//
-int8x8_t test_vld1_s8(int8_t const *a) {
-  return vld1_s8(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 2
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
-//
-int16x4_t test_vld1_s16(int16_t const *a) {
-  return vld1_s16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <2 x i32> @test_vld1_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[TMP12]], align 4
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 4
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i32>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <2 x i32> [[TMP32]]
-//
-int32x2_t test_vld1_s32(int32_t const *a) {
-  return vld1_s32(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <1 x i64> @test_vld1_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x i64>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x i64>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <1 x i64> [[TMP32]]
-//
-int64x1_t test_vld1_s64(int64_t const *a) {
-  return vld1_s64(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x half> @test_vld1_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x half>, ptr [[TMP12]], align 2
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <4 x half> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x half>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <4 x half> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x half>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x half> [[TMP32]]
-//
-float16x4_t test_vld1_f16(float16_t const *a) {
-  return vld1_f16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <2 x float> @test_vld1_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x float>, ptr [[TMP12]], align 4
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 4
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <2 x float> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <2 x float> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x float>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <2 x float> [[TMP32]]
-//
-float32x2_t test_vld1_f32(float32_t const *a) {
-  return vld1_f32(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <1 x double> @test_vld1_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x double>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x double>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x double>, ptr [[TMP12]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <1 x double> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x double>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <1 x double> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x double>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <1 x double> [[TMP32]]
-//
-float64x1_t test_vld1_f64(float64_t const *a) {
-  return vld1_f64(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
-//
-poly8x8_t test_vld1_p8(poly8_t const *a) {
-  return vld1_p8(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 2
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 2
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
-//
-poly16x4_t test_vld1_p16(poly16_t const *a) {
-  return vld1_p16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_u8_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
-//
-uint8x8_t test_vld1_u8_void(void *a) {
-  return vld1_u8(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_u16_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
-//
-uint16x4_t test_vld1_u16_void(void *a) {
-  return vld1_u16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <2 x i32> @test_vld1_u32_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i32>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <2 x i32> [[TMP32]]
-//
-uint32x2_t test_vld1_u32_void(void *a) {
-  return vld1_u32(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <1 x i64> @test_vld1_u64_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x i64>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x i64>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x i64>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <1 x i64> [[TMP32]]
-//
-uint64x1_t test_vld1_u64_void(void *a) {
-  return vld1_u64(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_s8_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
-//
-int8x8_t test_vld1_s8_void(void *a) {
-  return vld1_s8(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_s16_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
-//
-int16x4_t test_vld1_s16_void(void *a) {
-  return vld1_s16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <2 x i32> @test_vld1_s32_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i32>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <2 x i32> [[TMP32]]
-//
-int32x2_t test_vld1_s32_void(void *a) {
-  return vld1_s32(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <1 x i64> @test_vld1_s64_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x i64>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x i64>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x i64>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <1 x i64> [[TMP32]]
-//
-int64x1_t test_vld1_s64_void(void *a) {
-  return vld1_s64(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x half> @test_vld1_f16_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x half>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <4 x half> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x half>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <4 x half> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x half>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x half> [[TMP32]]
-//
-float16x4_t test_vld1_f16_void(void *a) {
-  return vld1_f16(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <2 x float> @test_vld1_f32_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <2 x float>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <2 x float> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <2 x float> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x float>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <2 x i32> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <2 x float> [[TMP32]]
-//
-float32x2_t test_vld1_f32_void(void *a) {
-  return vld1_f32(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <1 x double> @test_vld1_f64_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <1 x double>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <1 x double>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <1 x double>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <1 x double> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <1 x double>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <1 x double> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <1 x double>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <1 x i64> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <1 x double> [[TMP32]]
-//
-float64x1_t test_vld1_f64_void(void *a) {
-  return vld1_f64(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <8 x i8> @test_vld1_p8_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <8 x i8>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <8 x i8> [[TMP32]]
-//
-poly8x8_t test_vld1_p8_void(void *a) {
-  return vld1_p8(a);
-}
-
-// CHECK-LABEL: define dso_local noundef <4 x i16> @test_vld1_p16_void(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i16>, ptr [[TMP12]], align 1
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP12]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP21]], align 1
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD1]], ptr [[TMP24]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP18]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i16>, ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP30:%.*]] = xor i64 [[TMP29]], 193514046488576
-// CHECK-NEXT:    [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD2]], ptr [[TMP31]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP25]], ptr [[TMP]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i16>, ptr [[TMP]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = ptrtoint ptr [[TMP]] to i64
-// CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[TMP33]], 193514046488576
-// CHECK-NEXT:    [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP35]], align 8
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <4 x i16> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP36]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP37:%.*]], label [[TMP38:%.*]], !prof [[PROF2]]
-// CHECK:       37:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       38:
-// CHECK-NEXT:    ret <4 x i16> [[TMP32]]
-//
-poly16x4_t test_vld1_p16_void(void *a) {
-  return vld1_p16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vld2q_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X16X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <16 x i8>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT8X16X2_T]] [[TMP22]]
-//
-uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
-  return vld2q_u8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vld2q_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT16X8X2_T]] [[TMP22]]
-//
-uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
-  return vld2q_u16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vld2q_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X4X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i32>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT32X4X2_T]] [[TMP22]]
-//
-uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
-  return vld2q_u32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X2X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT64X2X2_T]] [[TMP22]]
-//
-uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
-  return vld2q_u64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vld2q_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X16X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <16 x i8>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT8X16X2_T]] [[TMP22]]
-//
-int8x16x2_t test_vld2q_s8(int8_t const *a) {
-  return vld2q_s8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vld2q_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT16X8X2_T]] [[TMP22]]
-//
-int16x8x2_t test_vld2q_s16(int16_t const *a) {
-  return vld2q_s16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vld2q_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X4X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i32>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT32X4X2_T]] [[TMP22]]
-//
-int32x4x2_t test_vld2q_s32(int32_t const *a) {
-  return vld2q_s32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X2X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT64X2X2_T]] [[TMP22]]
-//
-int64x2x2_t test_vld2q_s64(int64_t const *a) {
-  return vld2q_s64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float16x8x2_t @test_vld2q_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x half>, <8 x half> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP22]]
-//
-float16x8x2_t test_vld2q_f16(float16_t const *a) {
-  return vld2q_f16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vld2q_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <4 x float>, <4 x float> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X4X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i32>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT32X4X2_T]] [[TMP22]]
-//
-float32x4x2_t test_vld2q_f32(float32_t const *a) {
-  return vld2q_f32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X2_T]] [[TMP22]]
-//
-float64x2x2_t test_vld2q_f64(float64_t const *a) {
-  return vld2q_f64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vld2q_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X16X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <16 x i8>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY8X16X2_T]] [[TMP22]]
-//
-poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
-  return vld2q_p8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vld2q_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY16X8X2_T]] [[TMP22]]
-//
-poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
-  return vld2q_p16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vld2_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X8X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i8>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT8X8X2_T]] [[TMP22]]
-//
-uint8x8x2_t test_vld2_u8(uint8_t const *a) {
-  return vld2_u8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vld2_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT16X4X2_T]] [[TMP22]]
-//
-uint16x4x2_t test_vld2_u16(uint16_t const *a) {
-  return vld2_u16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vld2_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X2X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i32>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT32X2X2_T]] [[TMP22]]
-//
-uint32x2x2_t test_vld2_u32(uint32_t const *a) {
-  return vld2_u32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint64x1x2_t @test_vld2_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X1X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT64X1X2_T]] [[TMP22]]
-//
-uint64x1x2_t test_vld2_u64(uint64_t const *a) {
-  return vld2_u64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vld2_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X8X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i8>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT8X8X2_T]] [[TMP22]]
-//
-int8x8x2_t test_vld2_s8(int8_t const *a) {
-  return vld2_s8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vld2_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT16X4X2_T]] [[TMP22]]
-//
-int16x4x2_t test_vld2_s16(int16_t const *a) {
-  return vld2_s16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vld2_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X2X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i32>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT32X2X2_T]] [[TMP22]]
-//
-int32x2x2_t test_vld2_s32(int32_t const *a) {
-  return vld2_s32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int64x1x2_t @test_vld2_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X1X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT64X1X2_T]] [[TMP22]]
-//
-int64x1x2_t test_vld2_s64(int64_t const *a) {
-  return vld2_s64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float16x4x2_t @test_vld2_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x half>, <4 x half> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP22]]
-//
-float16x4x2_t test_vld2_f16(float16_t const *a) {
-  return vld2_f16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vld2_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <2 x float>, <2 x float> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X2X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i32>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT32X2X2_T]] [[TMP22]]
-//
-float32x2x2_t test_vld2_f32(float32_t const *a) {
-  return vld2_f32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X2_T]] [[TMP22]]
-//
-float64x1x2_t test_vld2_f64(float64_t const *a) {
-  return vld2_f64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vld2_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X8X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <8 x i8>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY8X8X2_T]] [[TMP22]]
-//
-poly8x8x2_t test_vld2_p8(poly8_t const *a) {
-  return vld2_p8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vld2_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY16X4X2_T]] [[TMP22]]
-//
-poly16x4x2_t test_vld2_p16(poly16_t const *a) {
-  return vld2_p16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint8x16x3_t @test_vld3q_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X16X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <16 x i8>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT8X16X3_T]] [[TMP22]]
-//
-uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
-  return vld3q_u8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint16x8x3_t @test_vld3q_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X8X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT16X8X3_T]] [[TMP22]]
-//
-uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
-  return vld3q_u16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint32x4x3_t @test_vld3q_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X4X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i32>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT32X4X3_T]] [[TMP22]]
-//
-uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
-  return vld3q_u32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X2X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT64X2X3_T]] [[TMP22]]
-//
-uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
-  return vld3q_u64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int8x16x3_t @test_vld3q_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X16X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <16 x i8>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT8X16X3_T]] [[TMP22]]
-//
-int8x16x3_t test_vld3q_s8(int8_t const *a) {
-  return vld3q_s8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int16x8x3_t @test_vld3q_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X8X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT16X8X3_T]] [[TMP22]]
-//
-int16x8x3_t test_vld3q_s16(int16_t const *a) {
-  return vld3q_s16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int32x4x3_t @test_vld3q_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X4X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i32>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT32X4X3_T]] [[TMP22]]
-//
-int32x4x3_t test_vld3q_s32(int32_t const *a) {
-  return vld3q_s32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X2X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT64X2X3_T]] [[TMP22]]
-//
-int64x2x3_t test_vld3q_s64(int64_t const *a) {
-  return vld3q_s64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float16x8x3_t @test_vld3q_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X8X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X3_T]] [[TMP22]]
-//
-float16x8x3_t test_vld3q_f16(float16_t const *a) {
-  return vld3q_f16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float32x4x3_t @test_vld3q_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X4X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i32>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT32X4X3_T]] [[TMP22]]
-//
-float32x4x3_t test_vld3q_f32(float32_t const *a) {
-  return vld3q_f32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X3_T]] [[TMP22]]
-//
-float64x2x3_t test_vld3q_f64(float64_t const *a) {
-  return vld3q_f64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly8x16x3_t @test_vld3q_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X16X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <16 x i8>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY8X16X3_T]] [[TMP22]]
-//
-poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
-  return vld3q_p8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly16x8x3_t @test_vld3q_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X8X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY16X8X3_T]] [[TMP22]]
-//
-poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
-  return vld3q_p16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint8x8x3_t @test_vld3_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X8X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i8>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT8X8X3_T]] [[TMP22]]
-//
-uint8x8x3_t test_vld3_u8(uint8_t const *a) {
-  return vld3_u8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint16x4x3_t @test_vld3_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X4X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT16X4X3_T]] [[TMP22]]
-//
-uint16x4x3_t test_vld3_u16(uint16_t const *a) {
-  return vld3_u16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint32x2x3_t @test_vld3_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X2X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i32>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT32X2X3_T]] [[TMP22]]
-//
-uint32x2x3_t test_vld3_u32(uint32_t const *a) {
-  return vld3_u32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint64x1x3_t @test_vld3_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X1X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT64X1X3_T]] [[TMP22]]
-//
-uint64x1x3_t test_vld3_u64(uint64_t const *a) {
-  return vld3_u64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int8x8x3_t @test_vld3_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X8X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i8>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT8X8X3_T]] [[TMP22]]
-//
-int8x8x3_t test_vld3_s8(int8_t const *a) {
-  return vld3_s8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int16x4x3_t @test_vld3_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X4X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT16X4X3_T]] [[TMP22]]
-//
-int16x4x3_t test_vld3_s16(int16_t const *a) {
-  return vld3_s16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int32x2x3_t @test_vld3_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X2X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i32>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT32X2X3_T]] [[TMP22]]
-//
-int32x2x3_t test_vld3_s32(int32_t const *a) {
-  return vld3_s32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int64x1x3_t @test_vld3_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X1X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT64X1X3_T]] [[TMP22]]
-//
-int64x1x3_t test_vld3_s64(int64_t const *a) {
-  return vld3_s64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float16x4x3_t @test_vld3_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X4X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X3_T]] [[TMP22]]
-//
-float16x4x3_t test_vld3_f16(float16_t const *a) {
-  return vld3_f16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float32x2x3_t @test_vld3_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X2X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i32>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT32X2X3_T]] [[TMP22]]
-//
-float32x2x3_t test_vld3_f32(float32_t const *a) {
-  return vld3_f32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X3_T]] [[TMP22]]
-//
-float64x1x3_t test_vld3_f64(float64_t const *a) {
-  return vld3_f64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly8x8x3_t @test_vld3_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X8X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <8 x i8>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY8X8X3_T]] [[TMP22]]
-//
-poly8x8x3_t test_vld3_p8(poly8_t const *a) {
-  return vld3_p8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly16x4x3_t @test_vld3_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X4X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY16X4X3_T]] [[TMP22]]
-//
-poly16x4x3_t test_vld3_p16(poly16_t const *a) {
-  return vld3_p16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint8x16x4_t @test_vld4q_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X16X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <16 x i8>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT8X16X4_T]] [[TMP22]]
-//
-uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
-  return vld4q_u8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint16x8x4_t @test_vld4q_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X8X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT16X8X4_T]] [[TMP22]]
-//
-uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
-  return vld4q_u16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint32x4x4_t @test_vld4q_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X4X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i32>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT32X4X4_T]] [[TMP22]]
-//
-uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
-  return vld4q_u32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X2X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT64X2X4_T]] [[TMP22]]
-//
-uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
-  return vld4q_u64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int8x16x4_t @test_vld4q_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X16X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <16 x i8>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT8X16X4_T]] [[TMP22]]
-//
-int8x16x4_t test_vld4q_s8(int8_t const *a) {
-  return vld4q_s8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int16x8x4_t @test_vld4q_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X8X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT16X8X4_T]] [[TMP22]]
-//
-int16x8x4_t test_vld4q_s16(int16_t const *a) {
-  return vld4q_s16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int32x4x4_t @test_vld4q_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X4X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i32>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT32X4X4_T]] [[TMP22]]
-//
-int32x4x4_t test_vld4q_s32(int32_t const *a) {
-  return vld4q_s32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X2X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT64X2X4_T]] [[TMP22]]
-//
-int64x2x4_t test_vld4q_s64(int64_t const *a) {
-  return vld4q_s64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float16x8x4_t @test_vld4q_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X8X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X4_T]] [[TMP22]]
-//
-float16x8x4_t test_vld4q_f16(float16_t const *a) {
-  return vld4q_f16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float32x4x4_t @test_vld4q_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X4X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i32>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <4 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT32X4X4_T]] [[TMP22]]
-//
-float32x4x4_t test_vld4q_f32(float32_t const *a) {
-  return vld4q_f32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X4_T]] [[TMP22]]
-//
-float64x2x4_t test_vld4q_f64(float64_t const *a) {
-  return vld4q_f64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly8x16x4_t @test_vld4q_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X16X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <16 x i8>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <16 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY8X16X4_T]] [[TMP22]]
-//
-poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
-  return vld4q_p8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly16x8x4_t @test_vld4q_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X8X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i16>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <8 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY16X8X4_T]] [[TMP22]]
-//
-poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
-  return vld4q_p16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint8x8x4_t @test_vld4_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT8X8X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i8>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT8X8X4_T]] [[TMP22]]
-//
-uint8x8x4_t test_vld4_u8(uint8_t const *a) {
-  return vld4_u8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint16x4x4_t @test_vld4_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT16X4X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT16X4X4_T]] [[TMP22]]
-//
-uint16x4x4_t test_vld4_u16(uint16_t const *a) {
-  return vld4_u16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint32x2x4_t @test_vld4_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT32X2X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i32>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT32X2X4_T]] [[TMP22]]
-//
-uint32x2x4_t test_vld4_u32(uint32_t const *a) {
-  return vld4_u32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.uint64x1x4_t @test_vld4_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_UINT64X1X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_UINT64X1X4_T]] [[TMP22]]
-//
-uint64x1x4_t test_vld4_u64(uint64_t const *a) {
-  return vld4_u64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int8x8x4_t @test_vld4_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT8X8X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i8>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT8X8X4_T]] [[TMP22]]
-//
-int8x8x4_t test_vld4_s8(int8_t const *a) {
-  return vld4_s8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int16x4x4_t @test_vld4_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT16X4X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT16X4X4_T]] [[TMP22]]
-//
-int16x4x4_t test_vld4_s16(int16_t const *a) {
-  return vld4_s16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int32x2x4_t @test_vld4_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT32X2X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i32>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT32X2X4_T]] [[TMP22]]
-//
-int32x2x4_t test_vld4_s32(int32_t const *a) {
-  return vld4_s32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.int64x1x4_t @test_vld4_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_INT64X1X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_INT64X1X4_T]] [[TMP22]]
-//
-int64x1x4_t test_vld4_s64(int64_t const *a) {
-  return vld4_s64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float16x4x4_t @test_vld4_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT16X4X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X4_T]] [[TMP22]]
-//
-float16x4x4_t test_vld4_f16(float16_t const *a) {
-  return vld4_f16(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float32x2x4_t @test_vld4_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT32X2X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i32>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <2 x i32>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT32X2X4_T]] [[TMP22]]
-//
-float32x2x4_t test_vld4_f32(float32_t const *a) {
-  return vld4_f32(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X4_T]] [[TMP22]]
-//
-float64x1x4_t test_vld4_f64(float64_t const *a) {
-  return vld4_f64(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly8x8x4_t @test_vld4_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY8X8X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <8 x i8>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <8 x i8>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY8X8X4_T]] [[TMP22]]
-//
-poly8x8x4_t test_vld4_p8(poly8_t const *a) {
-  return vld4_p8(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly16x4x4_t @test_vld4_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY16X4X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <4 x i16>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <4 x i16>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY16X4X4_T]] [[TMP22]]
-//
-poly16x4x4_t test_vld4_p16(poly16_t const *a) {
-  return vld4_p16(a);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <16 x i8> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <16 x i8>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
-// CHECK:       30:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       31:
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
-// CHECK-NEXT:    store <16 x i8> [[TMP26]], ptr [[TMP22]], align 1
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_u8(uint8_t *a, uint8x16_t b) {
-  vst1q_u8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <8 x i16> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i16>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i16> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <8 x i16>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[TMP32]], ptr [[TMP38]], align 2
-// CHECK-NEXT:    store <8 x i16> [[TMP33]], ptr [[TMP22]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_u16(uint16_t *a, uint16x8_t b) {
-  vst1q_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <4 x i32> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i32>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <4 x i32> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <4 x i32>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP38]], align 4
-// CHECK-NEXT:    store <4 x i32> [[TMP33]], ptr [[TMP22]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_u32(uint32_t *a, uint32x4_t b) {
-  vst1q_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i64> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <2 x i64>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[TMP32]], ptr [[TMP38]], align 8
-// CHECK-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP22]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_u64(uint64_t *a, uint64x2_t b) {
-  vst1q_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <16 x i8> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <16 x i8>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
-// CHECK:       30:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       31:
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
-// CHECK-NEXT:    store <16 x i8> [[TMP26]], ptr [[TMP22]], align 1
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_s8(int8_t *a, int8x16_t b) {
-  vst1q_s8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <8 x i16> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i16>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i16> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <8 x i16>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[TMP32]], ptr [[TMP38]], align 2
-// CHECK-NEXT:    store <8 x i16> [[TMP33]], ptr [[TMP22]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_s16(int16_t *a, int16x8_t b) {
-  vst1q_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <4 x i32> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i32>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <4 x i32> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <4 x i32>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP38]], align 4
-// CHECK-NEXT:    store <4 x i32> [[TMP33]], ptr [[TMP22]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_s32(int32_t *a, int32x4_t b) {
-  vst1q_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <2 x i64> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i64> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <2 x i64>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[TMP32]], ptr [[TMP38]], align 8
-// CHECK-NEXT:    store <2 x i64> [[TMP33]], ptr [[TMP22]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_s64(int64_t *a, int64x2_t b) {
-  vst1q_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <8 x half> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x half>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i16> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x half> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <8 x i16>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x half>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[TMP32]], ptr [[TMP38]], align 2
-// CHECK-NEXT:    store <8 x half> [[TMP33]], ptr [[TMP22]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_f16(float16_t *a, float16x8_t b) {
-  vst1q_f16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <4 x float> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x float>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x float> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <4 x i32>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x float>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <4 x i32> [[TMP32]], ptr [[TMP38]], align 4
-// CHECK-NEXT:    store <4 x float> [[TMP33]], ptr [[TMP22]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_f32(float32_t *a, float32x4_t b) {
-  vst1q_f32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <2 x double> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x double>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i64> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x double> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <2 x i64>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x double>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <2 x i64> [[TMP32]], ptr [[TMP38]], align 8
-// CHECK-NEXT:    store <2 x double> [[TMP33]], ptr [[TMP22]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_f64(float64_t *a, float64x2_t b) {
-  vst1q_f64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <16 x i8>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <16 x i8> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <16 x i8>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <16 x i8>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
-// CHECK:       30:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       31:
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    store <16 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
-// CHECK-NEXT:    store <16 x i8> [[TMP26]], ptr [[TMP22]], align 1
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_p8(poly8_t *a, poly8x16_t b) {
-  vst1q_p8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP5]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP11]], align 16
-// CHECK-NEXT:    store <8 x i16> [[B]], ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP14]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i16>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP18]], align 16
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[_MSLD]], ptr [[TMP21]], align 16
-// CHECK-NEXT:    store <8 x i16> [[TMP15]], ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i16>, ptr [[TMP29]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i16> [[_MSLD2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[TMP26]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i8> [[TMP30]] to <8 x i16>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <8 x i16> [[TMP32]], ptr [[TMP38]], align 2
-// CHECK-NEXT:    store <8 x i16> [[TMP33]], ptr [[TMP22]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_p16(poly16_t *a, poly16x8_t b) {
-  vst1q_p16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <8 x i8> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
-// CHECK:       30:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       31:
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
-// CHECK-NEXT:    store <8 x i8> [[TMP26]], ptr [[TMP22]], align 1
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_u8(uint8_t *a, uint8x8_t b) {
-  vst1_u8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <4 x i16> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <4 x i16>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[TMP32]], ptr [[TMP38]], align 2
-// CHECK-NEXT:    store <4 x i16> [[TMP33]], ptr [[TMP22]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_u16(uint16_t *a, uint16x4_t b) {
-  vst1_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <2 x i32> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i32> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <2 x i32>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[TMP32]], ptr [[TMP38]], align 4
-// CHECK-NEXT:    store <2 x i32> [[TMP33]], ptr [[TMP22]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_u32(uint32_t *a, uint32x2_t b) {
-  vst1_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <1 x i64> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <1 x i64> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <1 x i64> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <1 x i64>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[TMP32]], ptr [[TMP38]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP33]], ptr [[TMP22]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_u64(uint64_t *a, uint64x1_t b) {
-  vst1_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <8 x i8> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
-// CHECK:       30:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       31:
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
-// CHECK-NEXT:    store <8 x i8> [[TMP26]], ptr [[TMP22]], align 1
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_s8(int8_t *a, int8x8_t b) {
-  vst1_s8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <4 x i16> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <4 x i16>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[TMP32]], ptr [[TMP38]], align 2
-// CHECK-NEXT:    store <4 x i16> [[TMP33]], ptr [[TMP22]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_s16(int16_t *a, int16x4_t b) {
-  vst1_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <2 x i32> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <2 x i32> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i32> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <2 x i32>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[TMP32]], ptr [[TMP38]], align 4
-// CHECK-NEXT:    store <2 x i32> [[TMP33]], ptr [[TMP22]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_s32(int32_t *a, int32x2_t b) {
-  vst1_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <1 x i64> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <1 x i64> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <1 x i64> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <1 x i64>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[TMP32]], ptr [[TMP38]], align 8
-// CHECK-NEXT:    store <1 x i64> [[TMP33]], ptr [[TMP22]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_s64(int64_t *a, int64x1_t b) {
-  vst1_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <4 x half> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x half>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <4 x half> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x half>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x half> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <4 x i16>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x half>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[TMP32]], ptr [[TMP38]], align 2
-// CHECK-NEXT:    store <4 x half> [[TMP33]], ptr [[TMP22]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_f16(float16_t *a, float16x4_t b) {
-  vst1_f16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <2 x float> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <2 x float>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <2 x i32> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x float> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <2 x i32>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x float>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <2 x i32> [[TMP32]], ptr [[TMP38]], align 4
-// CHECK-NEXT:    store <2 x float> [[TMP33]], ptr [[TMP22]], align 4
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_f32(float32_t *a, float32x2_t b) {
-  vst1_f32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <1 x double>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <1 x double>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <1 x i64> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <1 x double> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <1 x double>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <1 x double> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <1 x double>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <1 x i64>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <1 x i64> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x double> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <1 x i64>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x double>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <1 x i64> [[TMP32]], ptr [[TMP38]], align 8
-// CHECK-NEXT:    store <1 x double> [[TMP33]], ptr [[TMP22]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_f64(float64_t *a, float64x1_t b) {
-  vst1_f64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <8 x i8>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <8 x i8> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <8 x i8> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <8 x i8>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
-// CHECK:       30:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       31:
-// CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP32]], 193514046488576
-// CHECK-NEXT:    [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
-// CHECK-NEXT:    store <8 x i8> [[_MSLD2]], ptr [[TMP34]], align 1
-// CHECK-NEXT:    store <8 x i8> [[TMP26]], ptr [[TMP22]], align 1
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_p8(poly8_t *a, poly8x8_t b) {
-  vst1_p8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP11]], align 8
-// CHECK-NEXT:    store <4 x i16> [[B]], ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 193514046488576
-// CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP14]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[B_ADDR]] to i64
-// CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
-// CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP18]], align 8
-// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
-// CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[_MSLD]], ptr [[TMP21]], align 8
-// CHECK-NEXT:    store <4 x i16> [[TMP15]], ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load i64, ptr [[TMP25]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
-// CHECK-NEXT:    [[TMP27:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP27]], 193514046488576
-// CHECK-NEXT:    [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-// CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i16>, ptr [[TMP29]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i16> [[_MSLD2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[TMP26]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i8> [[TMP30]] to <4 x i16>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD1]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF2]]
-// CHECK:       34:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       35:
-// CHECK-NEXT:    [[TMP36:%.*]] = ptrtoint ptr [[TMP22]] to i64
-// CHECK-NEXT:    [[TMP37:%.*]] = xor i64 [[TMP36]], 193514046488576
-// CHECK-NEXT:    [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-// CHECK-NEXT:    store <4 x i16> [[TMP32]], ptr [[TMP38]], align 2
-// CHECK-NEXT:    store <4 x i16> [[TMP33]], ptr [[TMP22]], align 2
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_p16(poly16_t *a, poly16x4_t b) {
-  vst1_p16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP29]], 0
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i8> [[_MSLD4]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP30]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
-// CHECK:       31:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       32:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) {
-  vst2q_u8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP34]], <8 x i16> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) {
-  vst2q_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP34]], <4 x i32> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) {
-  vst2q_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP34]], <2 x i64> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) {
-  vst2q_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP29]], 0
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i8> [[_MSLD4]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP30]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
-// CHECK:       31:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       32:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_s8(int8_t *a, int8x16x2_t b) {
-  vst2q_s8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP34]], <8 x i16> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_s16(int16_t *a, int16x8x2_t b) {
-  vst2q_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP34]], <4 x i32> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_s32(int32_t *a, int32x4x2_t b) {
-  vst2q_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP34]], <2 x i64> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
-  vst2q_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <8 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x half> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x half> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x half>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x half>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[TMP34]], <8 x half> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_f16(float16_t *a, float16x8x2_t b) {
-  vst2q_f16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <4 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x float> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x float>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x float>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[TMP34]], <4 x float> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_f32(float32_t *a, float32x4x2_t b) {
-  vst2q_f32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[TMP34]], <2 x double> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_f64(float64_t *a, float64x2x2_t b) {
-  vst2q_f64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i8> [[_MSLD3]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP29]], 0
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i8> [[_MSLD4]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP30]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
-// CHECK:       31:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       32:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) {
-  vst2q_p8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP34]], <8 x i16> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) {
-  vst2q_p16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP29]], 0
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i8> [[_MSLD4]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP30]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
-// CHECK:       31:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       32:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_u8(uint8_t *a, uint8x8x2_t b) {
-  vst2_u8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP34]], <4 x i16> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_u16(uint16_t *a, uint16x4x2_t b) {
-  vst2_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP34]], <2 x i32> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_u32(uint32_t *a, uint32x2x2_t b) {
-  vst2_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP34]], <1 x i64> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_u64(uint64_t *a, uint64x1x2_t b) {
-  vst2_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP29]], 0
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i8> [[_MSLD4]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP30]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
-// CHECK:       31:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       32:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_s8(int8_t *a, int8x8x2_t b) {
-  vst2_s8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP34]], <4 x i16> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_s16(int16_t *a, int16x4x2_t b) {
-  vst2_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP34]], <2 x i32> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_s32(int32_t *a, int32x2x2_t b) {
-  vst2_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP34]], <1 x i64> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_s64(int64_t *a, int64x1x2_t b) {
-  vst2_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <4 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x half> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x half> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x half>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x half>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[TMP34]], <4 x half> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_f16(float16_t *a, float16x4x2_t b) {
-  vst2_f16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <2 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x float> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x float> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x float>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x float>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[TMP34]], <2 x float> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_f32(float32_t *a, float32x2x2_t b) {
-  vst2_f32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[TMP34]], <1 x double> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_f64(float64_t *a, float64x1x2_t b) {
-  vst2_f64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i8> [[_MSLD3]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP29]], 0
-// CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i8> [[_MSLD4]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP30]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP31:%.*]], label [[TMP32:%.*]], !prof [[PROF2]]
-// CHECK:       31:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       32:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
-  vst2_p8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP34]], <4 x i16> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
-  vst2_p16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[_MSLD5]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP33]], 0
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[_MSLD6]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP34]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP35]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
-// CHECK:       36:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       37:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
-  vst3q_u8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i16> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i16> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i16> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP40]], <8 x i16> [[TMP42]], <8 x i16> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
-  vst3q_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x i32>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i32> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i32> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP40]], <4 x i32> [[TMP42]], <4 x i32> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
-  vst3q_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP40]], <2 x i64> [[TMP42]], <2 x i64> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
-  vst3q_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[_MSLD5]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP33]], 0
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[_MSLD6]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP34]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP35]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
-// CHECK:       36:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       37:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
-  vst3q_s8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i16> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i16> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i16> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP40]], <8 x i16> [[TMP42]], <8 x i16> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
-  vst3q_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x i32>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i32> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i32> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP40]], <4 x i32> [[TMP42]], <4 x i32> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
-  vst3q_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP40]], <2 x i64> [[TMP42]], <2 x i64> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
-  vst3q_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <8 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x half> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x half> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x half> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x half>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x half>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x half>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i16> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i16> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i16> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[TMP40]], <8 x half> [[TMP42]], <8 x half> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
-  vst3q_f16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <4 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x float> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x float> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x float>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x float>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x float>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i32> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i32> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[TMP40]], <4 x float> [[TMP42]], <4 x float> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
-  vst3q_f32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x double> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x double>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[TMP40]], <2 x double> [[TMP42]], <2 x double> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
-  vst3q_f64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[_MSLD5]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP33]], 0
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[_MSLD6]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP34]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP35]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
-// CHECK:       36:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       37:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
-  vst3q_p8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i16> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i16> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i16> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP40]], <8 x i16> [[TMP42]], <8 x i16> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
-  vst3q_p16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[_MSLD5]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP33]], 0
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[_MSLD6]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP34]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP35]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
-// CHECK:       36:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       37:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
-  vst3_u8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i16> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i16> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i16> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP40]], <4 x i16> [[TMP42]], <4 x i16> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
-  vst3_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x i32>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i32> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i32> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i32> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP40]], <2 x i32> [[TMP42]], <2 x i32> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
-  vst3_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP40]], <1 x i64> [[TMP42]], <1 x i64> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
-  vst3_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[_MSLD5]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP33]], 0
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[_MSLD6]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP34]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP35]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
-// CHECK:       36:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       37:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_s8(int8_t *a, int8x8x3_t b) {
-  vst3_s8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i16> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i16> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i16> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP40]], <4 x i16> [[TMP42]], <4 x i16> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_s16(int16_t *a, int16x4x3_t b) {
-  vst3_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x i32>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i32> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i32> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i32> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP40]], <2 x i32> [[TMP42]], <2 x i32> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_s32(int32_t *a, int32x2x3_t b) {
-  vst3_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP40]], <1 x i64> [[TMP42]], <1 x i64> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_s64(int64_t *a, int64x1x3_t b) {
-  vst3_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <4 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x half> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x half> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x half> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x half>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x half>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x half>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i16> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i16> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i16> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[TMP40]], <4 x half> [[TMP42]], <4 x half> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_f16(float16_t *a, float16x4x3_t b) {
-  vst3_f16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <2 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x float> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x float> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x float> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x float>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x float>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x float>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i32> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i32> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i32> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[TMP40]], <2 x float> [[TMP42]], <2 x float> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_f32(float32_t *a, float32x2x3_t b) {
-  vst3_f32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x double> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x double>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[TMP40]], <1 x double> [[TMP42]], <1 x double> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_f64(float64_t *a, float64x1x3_t b) {
-  vst3_f64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[_MSLD5]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP33]], 0
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[_MSLD6]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP34]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP35]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP36:%.*]], label [[TMP37:%.*]], !prof [[PROF2]]
-// CHECK:       36:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       37:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
-  vst3_p8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <4 x i16> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <4 x i16> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i16> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP40]], <4 x i16> [[TMP42]], <4 x i16> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
-  vst3_p16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP33:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <16 x i8>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i8> [[_MSLD8]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[_MSLD9]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP39]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[_MSLD10]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP40]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
-// CHECK:       41:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       42:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], <16 x i8> [[TMP33]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
-  vst4q_u8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i16>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i16> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i16> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <8 x i16>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x i16>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <8 x i16> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <8 x i16> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP46]], <8 x i16> [[TMP48]], <8 x i16> [[TMP50]], <8 x i16> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
-  vst4q_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i32>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i32> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i32> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x i32>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <4 x i32>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <4 x i32>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i32> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i32> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i32> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i32> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP46]], <4 x i32> [[TMP48]], <4 x i32> [[TMP50]], <4 x i32> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
-  vst4q_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i64> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x i64>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP46]], <2 x i64> [[TMP48]], <2 x i64> [[TMP50]], <2 x i64> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
-  vst4q_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP33:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <16 x i8>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i8> [[_MSLD8]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[_MSLD9]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP39]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[_MSLD10]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP40]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
-// CHECK:       41:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       42:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], <16 x i8> [[TMP33]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
-  vst4q_s8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i16>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i16> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i16> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <8 x i16>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x i16>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <8 x i16> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <8 x i16> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP46]], <8 x i16> [[TMP48]], <8 x i16> [[TMP50]], <8 x i16> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
-  vst4q_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <4 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i32>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i32> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i32> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x i32>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x i32>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x i32>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <4 x i32>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <4 x i32>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i32> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i32> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i32> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i32> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP46]], <4 x i32> [[TMP48]], <4 x i32> [[TMP50]], <4 x i32> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
-  vst4q_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i64> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x i64>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP46]], <2 x i64> [[TMP48]], <2 x i64> [[TMP50]], <2 x i64> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
-  vst4q_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <8 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x half> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x half> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x half> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i16>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i16> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x half> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x half>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x half>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x half>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <8 x i16>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x half>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <8 x i16> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <8 x i16> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[TMP46]], <8 x half> [[TMP48]], <8 x half> [[TMP50]], <8 x half> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
-  vst4q_f16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <4 x i32>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <4 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i32>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i32> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x float> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i32>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x float> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i32>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x float> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i32>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i32> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x float> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <4 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <4 x float>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <4 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <4 x float>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <4 x i32>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <4 x float>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <4 x i32>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <4 x float>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i32> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i32> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i32> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i32> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[TMP46]], <4 x float> [[TMP48]], <4 x float> [[TMP50]], <4 x float> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
-  vst4q_f32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x double> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x double> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x double>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x double>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[TMP46]], <2 x double> [[TMP48]], <2 x double> [[TMP50]], <2 x double> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
-  vst4q_f64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <16 x i8>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <16 x i8>, ptr [[TMP28]], align 16
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <16 x i8>, ptr [[TMP32]], align 16
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP33:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <16 x i8>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i8> [[_MSLD7]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i8> [[_MSLD8]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[_MSLD9]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP39]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[_MSLD10]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP40]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
-// CHECK:       41:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       42:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP21]], <16 x i8> [[TMP25]], <16 x i8> [[TMP29]], <16 x i8> [[TMP33]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
-  vst4q_p8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <8 x i16>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i16>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i16> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i16>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i16> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i16> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i16>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i16> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i16> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i16>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i16> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i16> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <8 x i16>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <8 x i16>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <8 x i16>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <8 x i16>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <8 x i16>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <8 x i16>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <8 x i16>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <8 x i16>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <8 x i16> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <8 x i16> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <8 x i16> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <8 x i16> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP46]], <8 x i16> [[TMP48]], <8 x i16> [[TMP50]], <8 x i16> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
-  vst4q_p16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_u8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i8>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i8> [[_MSLD8]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[_MSLD9]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP39]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[_MSLD10]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP40]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
-// CHECK:       41:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       42:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], <8 x i8> [[TMP33]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
-  vst4_u8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_u16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i16>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i16> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i16> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <4 x i16>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <4 x i16>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i16> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i16> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i16> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i16> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP46]], <4 x i16> [[TMP48]], <4 x i16> [[TMP50]], <4 x i16> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
-  vst4_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_u32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i32>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i32> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i32> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x i32>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <2 x i32>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <2 x i32>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i32> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i32> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i32> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i32> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP46]], <2 x i32> [[TMP48]], <2 x i32> [[TMP50]], <2 x i32> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
-  vst4_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_u64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x i64> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x i64>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP46]], <1 x i64> [[TMP48]], <1 x i64> [[TMP50]], <1 x i64> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
-  vst4_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_s8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i8>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i8> [[_MSLD8]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[_MSLD9]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP39]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[_MSLD10]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP40]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
-// CHECK:       41:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       42:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], <8 x i8> [[TMP33]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_s8(int8_t *a, int8x8x4_t b) {
-  vst4_s8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_s16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i16>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i16> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i16> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <4 x i16>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <4 x i16>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i16> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i16> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i16> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i16> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP46]], <4 x i16> [[TMP48]], <4 x i16> [[TMP50]], <4 x i16> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_s16(int16_t *a, int16x4x4_t b) {
-  vst4_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_s32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <2 x i32>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i32> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i32> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i32>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i32> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i32> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x i32>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x i32>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x i32>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <2 x i32>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <2 x i32>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i32> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i32> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i32> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i32> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP46]], <2 x i32> [[TMP48]], <2 x i32> [[TMP50]], <2 x i32> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_s32(int32_t *a, int32x2x4_t b) {
-  vst4_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_s64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x i64> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x i64>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP46]], <1 x i64> [[TMP48]], <1 x i64> [[TMP50]], <1 x i64> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_s64(int64_t *a, int64x1x4_t b) {
-  vst4_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_f16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <4 x half>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x half> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x half> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x half> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i16>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i16> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x half> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x half>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x half>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x half>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <4 x i16>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <4 x half>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i16> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i16> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i16> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i16> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[TMP46]], <4 x half> [[TMP48]], <4 x half> [[TMP50]], <4 x half> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_f16(float16_t *a, float16x4x4_t b) {
-  vst4_f16(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_f32(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i32>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <2 x i32>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <2 x float>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i32>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i32> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x float> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i32>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i32> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x float> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i32>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i32> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x float> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i32>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i32> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x float> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <2 x i32>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <2 x float>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <2 x i32>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <2 x float>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <2 x i32>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <2 x float>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <2 x i32>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <2 x float>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i32> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i32> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i32> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i32> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[TMP46]], <2 x float> [[TMP48]], <2 x float> [[TMP50]], <2 x float> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_f32(float32_t *a, float32x2x4_t b) {
-  vst4_f32(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_f64(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x double> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x double> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x double>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x double>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[TMP46]], <1 x double> [[TMP48]], <1 x double> [[TMP50]], <1 x double> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_f64(float64_t *a, float64x1x4_t b) {
-  vst4_f64(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_p8(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <8 x i8>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP25:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 193514046488576
-// CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <8 x i8>, ptr [[TMP28]], align 8
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP29:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP31:%.*]] = xor i64 [[TMP30]], 193514046488576
-// CHECK-NEXT:    [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <8 x i8>, ptr [[TMP32]], align 8
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP33:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <8 x i8>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i8> [[_MSLD7]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i8> [[_MSLD8]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[_MSLD9]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP39]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[_MSLD10]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP40]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF2]]
-// CHECK:       41:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       42:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP21]], <8 x i8> [[TMP25]], <8 x i8> [[TMP29]], <8 x i8> [[TMP33]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
-  vst4_p8(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_p16(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <4 x i16>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <4 x i16>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <4 x i16>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <4 x i16> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i16> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <4 x i16>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i16> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i16> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <4 x i16>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i16> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i16> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <4 x i16>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <4 x i16> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <4 x i16> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <4 x i16>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <4 x i16>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <4 x i16>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <4 x i16>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <4 x i16>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <4 x i16>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <4 x i16>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <4 x i16>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <4 x i16> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <4 x i16> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <4 x i16> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <4 x i16> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP46]], <4 x i16> [[TMP48]], <4 x i16> [[TMP50]], <4 x i16> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
-  vst4_p16(a, b);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld1q_f64_x2(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X2_T]] [[TMP22]]
-//
-float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
-  return vld1q_f64_x2(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld1q_p64_x2(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X2X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [2 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY64X2X2_T]] [[TMP22]]
-//
-poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
-  return vld1q_p64_x2(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld1_f64_x2(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X2_T]] [[TMP22]]
-//
-float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
-  return vld1_f64_x2(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld1_p64_x2(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 16)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X1X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [2 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [2 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY64X1X2_T]] [[TMP22]]
-//
-poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
-  return vld1_p64_x2(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld1q_f64_x3(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X3_T]] [[TMP22]]
-//
-float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
-  return vld1q_f64_x3(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld1q_p64_x3(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 48)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X2X3_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [3 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY64X2X3_T]] [[TMP22]]
-//
-poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
-  return vld1q_p64_x3(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld1_f64_x3(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X3_T]] [[TMP22]]
-//
-float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
-  return vld1_f64_x3(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld1_p64_x3(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 24)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X1X3_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [3 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [3 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY64X1X3_T]] [[TMP22]]
-//
-poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
-  return vld1_p64_x3(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld1q_f64_x4(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X2X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X4_T]] [[TMP22]]
-//
-float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
-  return vld1q_f64_x4(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld1q_p64_x4(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP2]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP11]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr [[TMP20]], align 16
-// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]], align 16
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 64)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X2X4_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <2 x i64>] }, ptr [[TMP25]], align 16
-// CHECK-NEXT:    store { [4 x <2 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY64X2X4_T]] [[TMP22]]
-//
-poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
-  return vld1q_p64_x4(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld1_f64_x4(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_FLOAT64X1X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X4_T]] [[TMP22]]
-//
-float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
-  return vld1_f64_x4(a);
-}
-
-// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld1_p64_x4(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
-// CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
-// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP5]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576
-// CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF2]]
-// CHECK:       16:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       17:
-// CHECK-NEXT:    [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr [[TMP12]])
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[__RET]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr [[TMP20]], align 8
-// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]], align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = call ptr @__msan_memcpy(ptr [[RETVAL]], ptr [[__RET]], i64 32)
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__RET]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP22:%.*]] = load [[STRUCT_POLY64X1X4_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP23:%.*]] = ptrtoint ptr [[RETVAL]] to i64
-// CHECK-NEXT:    [[TMP24:%.*]] = xor i64 [[TMP23]], 193514046488576
-// CHECK-NEXT:    [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
-// CHECK-NEXT:    [[_MSLD1:%.*]] = load { [4 x <1 x i64>] }, ptr [[TMP25]], align 8
-// CHECK-NEXT:    store { [4 x <1 x i64>] } [[_MSLD1]], ptr @__msan_retval_tls, align 8
-// CHECK-NEXT:    ret [[STRUCT_POLY64X1X4_T]] [[TMP22]]
-//
-poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
-  return vld1_p64_x4(a);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_f64_x2(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[TMP34]], <2 x double> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) {
-  vst1q_f64_x2(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_p64_x2(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD3]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD4]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[TMP33]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP35]] to i128
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP34]], <2 x i64> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
-  vst1q_p64_x2(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_f64_x2(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[TMP34]], <1 x double> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) {
-  vst1_f64_x2(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_p64_x2(
-// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [2 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 16, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 16)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD3:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD3]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD4:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD4]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[TMP33:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
-// CHECK-NEXT:    [[TMP35:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP36:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[TMP33]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP37]], 0
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP35]] to i64
-// CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP38]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP5]]
-// CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-// CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP39:%.*]], label [[TMP40:%.*]], !prof [[PROF2]]
-// CHECK:       39:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       40:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP34]], <1 x i64> [[TMP36]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
-  vst1_p64_x2(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_f64_x3(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x double> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x double>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[TMP40]], <2 x double> [[TMP42]], <2 x double> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) {
-  vst1q_f64_x3(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_p64_x3(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 48, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 48)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD5]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD6]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <2 x i64> [[TMP39]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <2 x i64> [[TMP41]] to i128
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <2 x i64> [[TMP43]] to i128
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i128 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP40]], <2 x i64> [[TMP42]], <2 x i64> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 48, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
-  vst1q_p64_x3(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_f64_x3(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x double> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x double>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[TMP40]], <1 x double> [[TMP42]], <1 x double> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
-  vst1_f64_x3(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_p64_x3(
-// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [3 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 24, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 24)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD5:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD5]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD6:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD6]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
-// CHECK-NEXT:    [[TMP41:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP42:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <1 x i64> [[TMP39]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP45]], 0
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <1 x i64> [[TMP41]] to i64
-// CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP46]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP8]]
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <1 x i64> [[TMP43]] to i64
-// CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP47]], 0
-// CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR]], [[_MSCMP9]]
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-// CHECK-NEXT:    br i1 [[_MSOR12]], label [[TMP48:%.*]], label [[TMP49:%.*]], !prof [[PROF2]]
-// CHECK:       48:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       49:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP40]], <1 x i64> [[TMP42]], <1 x i64> [[TMP44]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
-  vst1_p64_x3(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_f64_x4(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <2 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x double> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x double> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x double> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x double> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x double>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x double>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x double>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x double>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[TMP46]], <2 x double> [[TMP48]], <2 x double> [[TMP50]], <2 x double> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) {
-  vst1q_f64_x4(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_p64_x4(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <2 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP3]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <2 x i64>] [[TMP0]], ptr [[TMP9]], align 16
-// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 64, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 64)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <2 x i64>, ptr [[TMP24]], align 16
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <2 x i64> [[_MSLD7]] to <16 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <2 x i64> [[TMP21]] to <16 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <2 x i64>, ptr [[TMP30]], align 16
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <2 x i64> [[_MSLD8]] to <16 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <2 x i64> [[TMP27]] to <16 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <2 x i64>, ptr [[TMP36]], align 16
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <2 x i64> [[_MSLD9]] to <16 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <2 x i64> [[TMP33]] to <16 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <2 x i64>, ptr [[TMP42]], align 16
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <2 x i64> [[_MSLD10]] to <16 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <2 x i64> [[TMP39]] to <16 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <16 x i8> [[TMP25]] to <2 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <16 x i8> [[TMP26]] to <2 x i64>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <16 x i8> [[TMP31]] to <2 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <16 x i8> [[TMP32]] to <2 x i64>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <16 x i8> [[TMP37]] to <2 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <16 x i8> [[TMP38]] to <2 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <16 x i8> [[TMP43]] to <2 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <16 x i8> [[TMP44]] to <2 x i64>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <2 x i64> [[TMP45]] to i128
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <2 x i64> [[TMP47]] to i128
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <2 x i64> [[TMP49]] to i128
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <2 x i64> [[TMP51]] to i128
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP46]], <2 x i64> [[TMP48]], <2 x i64> [[TMP50]], <2 x i64> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
-  vst1q_p64_x4(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_f64_x4(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <1 x double>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x double> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x double> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x double> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x double> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x double>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x double>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x double>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x double>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[TMP46]], <1 x double> [[TMP48]], <1 x double> [[TMP50]], <1 x double> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) {
-  vst1_f64_x4(a, b);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_p64_x4(
-// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <1 x i64>], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-// CHECK-NEXT:    call void @llvm.donothing()
-// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
-// CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
-// CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-// CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 -1, i64 8, i1 false)
-// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
-// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[COERCE_DIVE]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
-// CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-// CHECK-NEXT:    store [4 x <1 x i64>] [[TMP0]], ptr [[TMP9]], align 8
-// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
-// CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-// CHECK-NEXT:    store i64 0, ptr [[TMP12]], align 8
-// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[__S1]] to i64
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576
-// CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP15]], i8 -1, i64 32, i1 false)
-// CHECK-NEXT:    [[TMP16:%.*]] = call ptr @__msan_memcpy(ptr [[__S1]], ptr [[B]], i64 32)
-// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = ptrtoint ptr [[A_ADDR]] to i64
-// CHECK-NEXT:    [[TMP19:%.*]] = xor i64 [[TMP18]], 193514046488576
-// CHECK-NEXT:    [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-// CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP20]], align 8
-// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP21:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    [[TMP22:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64
-// CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 193514046488576
-// CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-// CHECK-NEXT:    [[_MSLD7:%.*]] = load <1 x i64>, ptr [[TMP24]], align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <1 x i64> [[_MSLD7]] to <8 x i8>
-// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8>
-// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK-NEXT:    [[TMP27:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT:    [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64
-// CHECK-NEXT:    [[TMP29:%.*]] = xor i64 [[TMP28]], 193514046488576
-// CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
-// CHECK-NEXT:    [[_MSLD8:%.*]] = load <1 x i64>, ptr [[TMP30]], align 8
-// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <1 x i64> [[_MSLD8]] to <8 x i8>
-// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <1 x i64> [[TMP27]] to <8 x i8>
-// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK-NEXT:    [[TMP33:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT:    [[TMP34:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64
-// CHECK-NEXT:    [[TMP35:%.*]] = xor i64 [[TMP34]], 193514046488576
-// CHECK-NEXT:    [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
-// CHECK-NEXT:    [[_MSLD9:%.*]] = load <1 x i64>, ptr [[TMP36]], align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = bitcast <1 x i64> [[_MSLD9]] to <8 x i8>
-// CHECK-NEXT:    [[TMP38:%.*]] = bitcast <1 x i64> [[TMP33]] to <8 x i8>
-// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK-NEXT:    [[TMP39:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64
-// CHECK-NEXT:    [[TMP41:%.*]] = xor i64 [[TMP40]], 193514046488576
-// CHECK-NEXT:    [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
-// CHECK-NEXT:    [[_MSLD10:%.*]] = load <1 x i64>, ptr [[TMP42]], align 8
-// CHECK-NEXT:    [[TMP43:%.*]] = bitcast <1 x i64> [[_MSLD10]] to <8 x i8>
-// CHECK-NEXT:    [[TMP44:%.*]] = bitcast <1 x i64> [[TMP39]] to <8 x i8>
-// CHECK-NEXT:    [[TMP45:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64>
-// CHECK-NEXT:    [[TMP46:%.*]] = bitcast <8 x i8> [[TMP26]] to <1 x i64>
-// CHECK-NEXT:    [[TMP47:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64>
-// CHECK-NEXT:    [[TMP48:%.*]] = bitcast <8 x i8> [[TMP32]] to <1 x i64>
-// CHECK-NEXT:    [[TMP49:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64>
-// CHECK-NEXT:    [[TMP50:%.*]] = bitcast <8 x i8> [[TMP38]] to <1 x i64>
-// CHECK-NEXT:    [[TMP51:%.*]] = bitcast <8 x i8> [[TMP43]] to <1 x i64>
-// CHECK-NEXT:    [[TMP52:%.*]] = bitcast <8 x i8> [[TMP44]] to <1 x i64>
-// CHECK-NEXT:    [[TMP53:%.*]] = bitcast <1 x i64> [[TMP45]] to i64
-// CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP53]], 0
-// CHECK-NEXT:    [[TMP54:%.*]] = bitcast <1 x i64> [[TMP47]] to i64
-// CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i64 [[TMP54]], 0
-// CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP11]]
-// CHECK-NEXT:    [[TMP55:%.*]] = bitcast <1 x i64> [[TMP49]] to i64
-// CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i64 [[TMP55]], 0
-// CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSOR]], [[_MSCMP12]]
-// CHECK-NEXT:    [[TMP56:%.*]] = bitcast <1 x i64> [[TMP51]] to i64
-// CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i64 [[TMP56]], 0
-// CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-// CHECK-NEXT:    [[_MSCMP16:%.*]] = icmp ne i64 [[_MSLD]], 0
-// CHECK-NEXT:    [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
-// CHECK-NEXT:    br i1 [[_MSOR17]], label [[TMP57:%.*]], label [[TMP58:%.*]], !prof [[PROF2]]
-// CHECK:       57:
-// CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
-// CHECK-NEXT:    unreachable
-// CHECK:       58:
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP46]], <1 x i64> [[TMP48]], <1 x i64> [[TMP50]], <1 x i64> [[TMP52]], ptr [[TMP17]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[__S1]]) #[[ATTR4]]
-// CHECK-NEXT:    ret void
-//
-void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) {
-  vst1_p64_x4(a, b);
-}
-//.
-// CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
-//.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
new file mode 100644
index 0000000000000..b4f8c62e405e6
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
@@ -0,0 +1,1515 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; Test basic MemorySanitizer instrumentation.
+;
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-st1.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
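+; Reading aid for the checks below: each argument's shadow is loaded from
+; @__msan_param_tls at the byte offset accumulated from the preceding
+; arguments, rounded up to 8 bytes. For the 8-byte vector variants the
+; shadows of A, B, C, D and the pointer therefore sit at offsets 0, 8, 16,
+; 24 and 32; for the 16-byte vector variants at 0, 16, 32, 48 and 64.
+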
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+define void @st2_8b(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> %B, ptr %P)
+  ret void
+}
+
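+; In the *_undef* variants below, the shadow of an undef operand is a poisoned
+; constant by default, so the corresponding check constant-folds and the
+; instrumentation emits an unconditional @__msan_warning_noreturn call, as the
+; generated checks show. Note also that no shadow is stored for the bytes
+; written through %P, so the destination's shadow is left unchanged by the
+; st2/st3/st4 call.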
+define void @st2_8b_undefA(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_8b_undefA
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> %B, ptr %P)
+  ret void
+}
+
+define void @st2_8b_undefB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_8b_undefB
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st2_8b_undefAB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_8b_undefAB
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P)
+  ret void
+}
+
+define void @st3_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_8b_undefA
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> %C, ptr %P)
+  ret void
+}
+
+define void @st3_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_8b_undefB
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> %C, ptr %P)
+  ret void
+}
+
+define void @st3_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_8b_undefC
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st3_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_8b_undefAB
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> %C, ptr %P)
+  ret void
+}
+
+define void @st3_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_8b_undefAC
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st3_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_8b_undefBC
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st3_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_8b_undefABC
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefA
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefB
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> %C, <8 x i8> %D, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefC
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> undef, <8 x i8> %D, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefD
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefAB
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> %C, <8 x i8> %D, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefAC
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> undef, <8 x i8> %D, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefBC
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> %D, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefBD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefBD
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> %C, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefABC
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> %D, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefABD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefABD
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> %C, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefACD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefACD
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> undef, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefBCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefBCD
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+define void @st4_8b_undefABCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8b_undefABCD
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr %P)
+  ret void
+}
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+declare void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8>, <8 x i8>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8>, <8 x i8>, <8 x i8>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, ptr) nounwind sanitize_memory readonly
+
+define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %P)
+  ret void
+}
+
+define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P)
+  ret void
+}
+
+define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P)
+  ret void
+}
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+declare void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly
+
+define void @st2_4h(<4 x i16> %A, <4 x i16> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_4h
+; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> %A, <4 x i16> %B, ptr %P)
+  ret void
+}
+
+define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_4h
+; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P)
+  ret void
+}
+
+define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_4h
+; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], <4 x i16> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %P)
+  ret void
+}
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+declare void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16>, <4 x i16>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16>, <4 x i16>, <4 x i16>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, ptr) nounwind sanitize_memory readonly
+
+define void @st2_8h(<8 x i16> %A, <8 x i16> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_8h
+; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> %A, <8 x i16> %B, ptr %P)
+  ret void
+}
+
+define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_8h
+; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P)
+  ret void
+}
+
+define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_8h
+; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %P)
+  ret void
+}
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+declare void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16>, <8 x i16>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, ptr) nounwind sanitize_memory readonly
+
+define void @st2_2s(<2 x i32> %A, <2 x i32> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_2s
+; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> %A, <2 x i32> %B, ptr %P)
+  ret void
+}
+
+define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_2s
+; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P)
+  ret void
+}
+
+define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2s
+; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], <2 x i32> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %P)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32>, <2 x i32>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32>, <2 x i32>, <2 x i32>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, ptr) nounwind sanitize_memory readonly
+
+define void @st2_4s(<4 x i32> %A, <4 x i32> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_4s
+; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %A, <4 x i32> %B, ptr %P)
+  ret void
+}
+
+define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_4s
+; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P)
+  ret void
+}
+
+define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_4s
+; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %P)
+  ret void
+}
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+declare void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32>, <4 x i32>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, ptr) nounwind sanitize_memory readonly
+
+; If there's only one element, st2/3/4 don't make much sense; stick to st1.
+define void @st2_1d(<1 x i64> %A, <1 x i64> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_1d
+; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %P)
+  ret void
+}
+
+define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_1d
+; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P)
+  ret void
+}
+
+define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_1d
+; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %P)
+  ret void
+}
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+declare void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64>, <1 x i64>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, ptr) nounwind sanitize_memory readonly
+
+define void @st2_2d(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_2d
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %P)
+  ret void
+}
+
+define void @st2_2d_undefA(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_2d_undefA
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> %B, ptr %P)
+  ret void
+}
+
+define void @st2_2d_undefB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_2d_undefB
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st2_2d_undefAB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st2_2d_undefAB
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_2d
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P)
+  ret void
+}
+
+define void @st3_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_2d_undefA
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> %C, ptr %P)
+  ret void
+}
+
+define void @st3_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_2d_undefB
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> %C, ptr %P)
+  ret void
+}
+
+define void @st3_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_2d_undefC
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st3_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_2d_undefAB
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> %C, ptr %P)
+  ret void
+}
+
+define void @st3_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_2d_undefAC
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st3_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_2d_undefBC
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st3_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st3_2d_undefABC
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64>, <2 x i64>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, ptr) nounwind sanitize_memory readonly
+declare void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, ptr) nounwind sanitize_memory readonly
+
+define void @st4_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefA
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefB
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> %C, <2 x i64> %D, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefC
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> undef, <2 x i64> %D, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefD
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefAB
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> %C, <2 x i64> %D, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefAC
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> undef, <2 x i64> %D, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefAD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefAD
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> %C, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefBC
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> undef, <2 x i64> %D, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefBD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefBD
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> %C, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefCD
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> undef, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefABC
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> %D, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefABD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefABD
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> %C, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefACD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefACD
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> undef, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefBCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefBCD
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P)
+  ret void
+}
+
+define void @st4_2d_undefABCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P) nounwind sanitize_memory {
+; CHECK-LABEL: define void @st4_2d_undefABCD
+; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P)
+  ret void
+}

>From 0c3c3706a5763196edf5ed59f16cbff19479e515 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Tue, 16 Jul 2024 18:15:10 +0000
Subject: [PATCH 3/5] Update test comment

---
 llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
index b4f8c62e405e6..d6efd5603b0e6 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool build/bin/opt --version 2
-; Test basic address sanitizer instrumentation.
+; Test memory sanitizer instrumentation for Arm NEON VST instructions.
 ;
 ; RUN: build/bin/opt < %s -passes=msan -S | FileCheck %s
 ;

>From 665673c0895325af2fcb86449ecefeb8cb225d4f Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Tue, 16 Jul 2024 19:42:31 +0000
Subject: [PATCH 4/5] Update test to use track-origins

---
 .../MemorySanitizer/AArch64/neon_vst.ll       | 1867 +++++++++++------
 1 file changed, 1271 insertions(+), 596 deletions(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
index d6efd5603b0e6..9b7f508c2a9a8 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool build/bin/opt --version 2
 ; Test memory sanitizer instrumentation for Arm NEON VST instructions.
 ;
-; RUN: build/bin/opt < %s -passes=msan -S | FileCheck %s
+; RUN: build/bin/opt < %s -passes=msan -msan-track-origins=2 -S | FileCheck %s
 ;
 ; Forked from llvm/test/CodeGen/AArch64/arm64-st1.ll
 
@@ -14,21 +14,32 @@ define void @st2_8b(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
 ; CHECK-LABEL: define void @st2_8b
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0:![0-9]+]]
-; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -40,9 +51,11 @@ define void @st2_8b_undefA(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m
 ; CHECK-LABEL: define void @st2_8b_undefA
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -54,9 +67,18 @@ define void @st2_8b_undefB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m
 ; CHECK-LABEL: define void @st2_8b_undefB
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -68,8 +90,9 @@ define void @st2_8b_undefAB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_
 ; CHECK-LABEL: define void @st2_8b_undefAB
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -81,25 +104,41 @@ define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sani
 ; CHECK-LABEL: define void @st3_8b
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
-; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -111,10 +150,13 @@ define void @st3_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
 ; CHECK-LABEL: define void @st3_8b_undefA
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -126,10 +168,20 @@ define void @st3_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
 ; CHECK-LABEL: define void @st3_8b_undefB
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -141,15 +193,27 @@ define void @st3_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
 ; CHECK-LABEL: define void @st3_8b_undefC
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -161,9 +225,11 @@ define void @st3_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
 ; CHECK-LABEL: define void @st3_8b_undefAB
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -175,9 +241,11 @@ define void @st3_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
 ; CHECK-LABEL: define void @st3_8b_undefAC
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -189,9 +257,18 @@ define void @st3_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
 ; CHECK-LABEL: define void @st3_8b_undefBC
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -203,8 +280,9 @@ define void @st3_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) noun
 ; CHECK-LABEL: define void @st3_8b_undefABC
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -216,29 +294,50 @@ define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P)
 ; CHECK-LABEL: define void @st4_8b
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF0]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x i8> [[TMP7]] to i64
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
 ; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       24:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -250,11 +349,15 @@ define void @st4_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
 ; CHECK-LABEL: define void @st4_8b_undefA
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -266,11 +369,22 @@ define void @st4_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
 ; CHECK-LABEL: define void @st4_8b_undefB
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -282,16 +396,29 @@ define void @st4_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
 ; CHECK-LABEL: define void @st4_8b_undefC
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -303,19 +430,36 @@ define void @st4_8b_undefD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
 ; CHECK-LABEL: define void @st4_8b_undefD
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -327,10 +471,13 @@ define void @st4_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 ; CHECK-LABEL: define void @st4_8b_undefAB
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -342,10 +489,13 @@ define void @st4_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 ; CHECK-LABEL: define void @st4_8b_undefAC
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -357,10 +507,20 @@ define void @st4_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 ; CHECK-LABEL: define void @st4_8b_undefBC
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -372,10 +532,20 @@ define void @st4_8b_undefBD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 ; CHECK-LABEL: define void @st4_8b_undefBD
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -387,9 +557,11 @@ define void @st4_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 ; CHECK-LABEL: define void @st4_8b_undefABC
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -401,9 +573,11 @@ define void @st4_8b_undefABD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 ; CHECK-LABEL: define void @st4_8b_undefABD
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -415,9 +589,11 @@ define void @st4_8b_undefACD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 ; CHECK-LABEL: define void @st4_8b_undefACD
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -429,9 +605,18 @@ define void @st4_8b_undefBCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
 ; CHECK-LABEL: define void @st4_8b_undefBCD
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -443,8 +628,9 @@ define void @st4_8b_undefABCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D
 ; CHECK-LABEL: define void @st4_8b_undefABCD
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -462,21 +648,32 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memor
 ; CHECK-LABEL: define void @st2_16b
 ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -488,25 +685,41 @@ define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind
 ; CHECK-LABEL: define void @st3_16b
 ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
-; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -518,29 +731,50 @@ define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr
 ; CHECK-LABEL: define void @st4_16b
 ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF0]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       24:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -558,21 +792,32 @@ define void @st2_4h(<4 x i16> %A, <4 x i16> %B, ptr %P) nounwind sanitize_memory
 ; CHECK-LABEL: define void @st2_4h
 ; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -584,25 +829,41 @@ define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P) nounwind s
 ; CHECK-LABEL: define void @st3_4h
 ; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
-; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -614,29 +875,50 @@ define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr
 ; CHECK-LABEL: define void @st4_4h
 ; CHECK-SAME: (<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF0]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x i16> [[TMP7]] to i64
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
 ; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       24:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], <4 x i16> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -654,21 +936,32 @@ define void @st2_8h(<8 x i16> %A, <8 x i16> %B, ptr %P) nounwind sanitize_memory
 ; CHECK-LABEL: define void @st2_8h
 ; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -680,25 +973,41 @@ define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P) nounwind s
 ; CHECK-LABEL: define void @st3_8h
 ; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
-; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -710,29 +1019,50 @@ define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr
 ; CHECK-LABEL: define void @st4_8h
 ; CHECK-SAME: (<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF0]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x i16> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x i16> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       24:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -750,21 +1080,32 @@ define void @st2_2s(<2 x i32> %A, <2 x i32> %B, ptr %P) nounwind sanitize_memory
 ; CHECK-LABEL: define void @st2_2s
 ; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -776,25 +1117,41 @@ define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P) nounwind s
 ; CHECK-LABEL: define void @st3_2s
 ; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
-; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -806,29 +1163,50 @@ define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr
 ; CHECK-LABEL: define void @st4_2s
 ; CHECK-SAME: (<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF0]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <2 x i32> [[TMP7]] to i64
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
 ; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       24:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], <2 x i32> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -844,21 +1222,32 @@ define void @st2_4s(<4 x i32> %A, <4 x i32> %B, ptr %P) nounwind sanitize_memory
 ; CHECK-LABEL: define void @st2_4s
 ; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -870,25 +1259,41 @@ define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P) nounwind s
 ; CHECK-LABEL: define void @st3_4s
 ; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
-; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -900,29 +1305,50 @@ define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr
 ; CHECK-LABEL: define void @st4_4s
 ; CHECK-SAME: (<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF0]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       24:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -941,21 +1367,32 @@ define void @st2_1d(<1 x i64> %A, <1 x i64> %B, ptr %P) nounwind sanitize_memory
 ; CHECK-LABEL: define void @st2_1d
 ; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -967,25 +1404,41 @@ define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P) nounwind s
 ; CHECK-LABEL: define void @st3_1d
 ; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
-; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -997,29 +1450,50 @@ define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr
 ; CHECK-LABEL: define void @st4_1d
 ; CHECK-SAME: (<1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF0]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP7]] to i64
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
 ; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       24:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1037,21 +1511,32 @@ define void @st2_2d(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory
 ; CHECK-LABEL: define void @st2_2d
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1063,9 +1548,11 @@ define void @st2_2d_undefA(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
 ; CHECK-LABEL: define void @st2_2d_undefA
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1077,9 +1564,18 @@ define void @st2_2d_undefB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
 ; CHECK-LABEL: define void @st2_2d_undefB
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1091,8 +1587,9 @@ define void @st2_2d_undefAB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitiz
 ; CHECK-LABEL: define void @st2_2d_undefAB
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1104,25 +1601,41 @@ define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind s
 ; CHECK-LABEL: define void @st3_2d
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
-; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1134,10 +1647,13 @@ define void @st3_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
 ; CHECK-LABEL: define void @st3_2d_undefA
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1149,10 +1665,20 @@ define void @st3_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
 ; CHECK-LABEL: define void @st3_2d_undefB
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1164,15 +1690,27 @@ define void @st3_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
 ; CHECK-LABEL: define void @st3_2d_undefC
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1184,9 +1722,11 @@ define void @st3_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
 ; CHECK-LABEL: define void @st3_2d_undefAB
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1198,9 +1738,11 @@ define void @st3_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
 ; CHECK-LABEL: define void @st3_2d_undefAC
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1212,9 +1754,18 @@ define void @st3_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
 ; CHECK-LABEL: define void @st3_2d_undefBC
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1226,8 +1777,9 @@ define void @st3_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) n
 ; CHECK-LABEL: define void @st3_2d_undefABC
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1239,29 +1791,50 @@ define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr
 ; CHECK-LABEL: define void @st4_2d
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
-; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       11:
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF0]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       24:
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1277,11 +1850,15 @@ define void @st4_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
 ; CHECK-LABEL: define void @st4_2d_undefA
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1293,11 +1870,22 @@ define void @st4_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
 ; CHECK-LABEL: define void @st4_2d_undefB
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1309,16 +1897,29 @@ define void @st4_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
 ; CHECK-LABEL: define void @st4_2d_undefC
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1330,19 +1931,36 @@ define void @st4_2d_undefD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
 ; CHECK-LABEL: define void @st4_2d_undefD
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1354,10 +1972,13 @@ define void @st4_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefAB
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1369,10 +1990,13 @@ define void @st4_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefAC
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1384,10 +2008,13 @@ define void @st4_2d_undefAD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefAD
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1399,10 +2026,20 @@ define void @st4_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefBC
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1414,10 +2051,20 @@ define void @st4_2d_undefBD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefBD
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1429,15 +2076,27 @@ define void @st4_2d_undefCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefCD
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1449,9 +2108,11 @@ define void @st4_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefABC
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1463,9 +2124,11 @@ define void @st4_2d_undefABD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefABD
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1477,9 +2140,11 @@ define void @st4_2d_undefACD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefACD
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1491,9 +2156,18 @@ define void @st4_2d_undefBCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
 ; CHECK-LABEL: define void @st4_2d_undefBCD
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1505,8 +2179,9 @@ define void @st4_2d_undefABCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64
 ; CHECK-LABEL: define void @st4_2d_undefABCD
 ; CHECK-SAME: (<2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 0) #[[ATTR4]]
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;

>From 606fbe9359bbbe729eadb1a55865bf9270426074 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Tue, 16 Jul 2024 20:00:13 +0000
Subject: [PATCH 5/5] Fix tool path

---
 llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
index 9b7f508c2a9a8..9490356716c0e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
@@ -1,7 +1,7 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool build/bin/opt --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool opt --version 2
 ; Test memory sanitizer instrumentation for Arm NEON VST instructions.
 ;
-; RUN: build/bin/opt < %s -passes=msan -msan-track-origins=2 -S | FileCheck %s
+; RUN: opt < %s -passes=msan -msan-track-origins=2 -S | FileCheck %s
 ;
 ; Forked from llvm/test/CodeGen/AArch64/arm64-st1.ll
 

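For readers skimming the generated checks, here is a minimal, hand-written C sketch (not part of the patch; the function name is illustrative) of the four-register NEON store that lowers to the llvm.aarch64.neon.st4 intrinsic exercised in the hunks above. Under -fsanitize=memory with origin tracking enabled, MSan is expected to check the shadow, and the corresponding origin, of every operand before the store:

  #include <arm_neon.h>

  /* Interleaved store of four int64x2_t registers to dst. Compiled with
     -fsanitize=memory on AArch64, the intrinsic call lowers to
     llvm.aarch64.neon.st4.v2i64.p0. */
  void store4(int64_t *dst, int64x2x4_t v) {
    vst4q_s64(dst, v);  /* MSan should warn if dst or any lane of v is uninitialized */
  }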

