[llvm] [msan][NFCI] Add tests for Arm NEON vector load (PR #125267)

Thurston Dang via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 31 10:57:05 PST 2025


https://github.com/thurstond created https://github.com/llvm/llvm-project/pull/125267

Forked from llvm/test/CodeGen/AArch64/arm64-ld1.ll

Incorrectly handled by handleUnknownInstruction:
- llvm.aarch64.neon.ld1x2, llvm.aarch64.neon.ld1x3, llvm.aarch64.neon.ld1x4
- llvm.aarch64.neon.ld2, llvm.aarch64.neon.ld3, llvm.aarch64.neon.ld4
- llvm.aarch64.neon.ld2lane, llvm.aarch64.neon.ld3lane, llvm.aarch64.neon.ld4lane
- llvm.aarch64.neon.ld2r, llvm.aarch64.neon.ld3r, llvm.aarch64.neon.ld4r

>From 9d384154d9367ec1eec7829aaa99b8572ac3fe6b Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Fri, 31 Jan 2025 18:51:29 +0000
Subject: [PATCH] [msan][NFCI] Add tests for Arm NEON vector load

Forked from llvm/test/CodeGen/AArch64/arm64-ld1.ll

Incorrectly handled by handleUnknownInstruction:
- llvm.aarch64.neon.ld1x2, llvm.aarch64.neon.ld1x3, llvm.aarch64.neon.ld1x4
- llvm.aarch64.neon.ld2, llvm.aarch64.neon.ld3, llvm.aarch64.neon.ld4
- llvm.aarch64.neon.ld2lane, llvm.aarch64.neon.ld3lane, llvm.aarch64.neon.ld4lane
- llvm.aarch64.neon.ld2r, llvm.aarch64.neon.ld3r, llvm.aarch64.neon.ld4r
---
 .../MemorySanitizer/AArch64/arm64-ld1.ll      | 4100 +++++++++++++++++
 1 file changed, 4100 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
new file mode 100644
index 00000000000000..673a14a9371b49
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
@@ -0,0 +1,4100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-ld1.ll
+;
+; Incorrectly handled (handleUnknownInstruction):
+; - llvm.aarch64.neon.ld1x2, llvm.aarch64.neon.ld1x3, llvm.aarch64.neon.ld1x4
+; - llvm.aarch64.neon.ld2, llvm.aarch64.neon.ld3, llvm.aarch64.neon.ld4
+; - llvm.aarch64.neon.ld2lane, llvm.aarch64.neon.ld3lane, llvm.aarch64.neon.ld4lane
+; - llvm.aarch64.neon.ld2r, llvm.aarch64.neon.ld3r, llvm.aarch64.neon.ld4r
+
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> } ; returned by @ld2_8b
+%struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> } ; returned by @ld3_8b
+%struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>, <8 x i8>,  <8 x i8> } ; returned by @ld4_8b
+
+define %struct.__neon_int8x8x2_t @ld2_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x2_t @ld2_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X2_T]] [[TMP8]]
+;
+; The CHECK lines above pin MSan's output for ld2: the shadow of %A is loaded from
+; @__msan_param_tls and checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x8x3_t @ld3_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x3_t @ld3_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X3_T]] [[TMP10]]
+;
+; The CHECK lines above pin MSan's output for ld3: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x8x4_t @ld4_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x4_t @ld4_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP10]], <8 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X4_T]] [[TMP12]]
+;
+; The CHECK lines above pin MSan's output for ld4: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0(ptr) nounwind readonly
+
+%struct.__neon_int8x16x2_t = type { <16 x i8>,  <16 x i8> } ; returned by @ld2_16b
+%struct.__neon_int8x16x3_t = type { <16 x i8>,  <16 x i8>,  <16 x i8> } ; returned by @ld3_16b
+%struct.__neon_int8x16x4_t = type { <16 x i8>,  <16 x i8>, <16 x i8>,  <16 x i8> } ; returned by @ld4_16b
+
+define %struct.__neon_int8x16x2_t @ld2_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP8]]
+;
+; The CHECK lines above pin MSan's output for ld2: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x3_t @ld3_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP10]]
+;
+; The CHECK lines above pin MSan's output for ld3: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x4_t @ld4_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP10]], <16 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP12]]
+;
+; The CHECK lines above pin MSan's output for ld4: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0(ptr) nounwind readonly
+
+%struct.__neon_int16x4x2_t = type { <4 x i16>,  <4 x i16> } ; returned by @ld2_4h
+%struct.__neon_int16x4x3_t = type { <4 x i16>,  <4 x i16>,  <4 x i16> } ; returned by @ld3_4h
+%struct.__neon_int16x4x4_t = type { <4 x i16>,  <4 x i16>, <4 x i16>,  <4 x i16> } ; returned by @ld4_4h
+
+define %struct.__neon_int16x4x2_t @ld2_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x2_t @ld2_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X2_T]] [[TMP8]]
+;
+; The CHECK lines above pin MSan's output for ld2: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x4x3_t @ld3_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x3_t @ld3_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X3_T]] [[TMP10]]
+;
+; The CHECK lines above pin MSan's output for ld3: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int16x4x4_t @ld4_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x4_t @ld4_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP10]], <4 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X4_T]] [[TMP12]]
+;
+; The CHECK lines above pin MSan's output for ld4: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0(ptr) nounwind readonly
+
+%struct.__neon_int16x8x2_t = type { <8 x i16>,  <8 x i16> } ; returned by @ld2_8h
+%struct.__neon_int16x8x3_t = type { <8 x i16>,  <8 x i16>,  <8 x i16> } ; returned by @ld3_8h
+%struct.__neon_int16x8x4_t = type { <8 x i16>,  <8 x i16>, <8 x i16>,  <8 x i16> } ; returned by @ld4_8h
+
+define %struct.__neon_int16x8x2_t @ld2_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP8]]
+;
+; The CHECK lines above pin MSan's output for ld2: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x3_t @ld3_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP10]]
+;
+; The CHECK lines above pin MSan's output for ld3: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x3_t %tmpvar2
+}
+
+define %struct.__neon_int16x8x4_t @ld4_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP10]], <8 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP12]]
+;
+; The CHECK lines above pin MSan's output for ld4: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0(ptr) nounwind readonly
+
+%struct.__neon_int32x2x2_t = type { <2 x i32>,  <2 x i32> } ; returned by @ld2_2s
+%struct.__neon_int32x2x3_t = type { <2 x i32>,  <2 x i32>,  <2 x i32> } ; returned by @ld3_2s
+%struct.__neon_int32x2x4_t = type { <2 x i32>,  <2 x i32>, <2 x i32>,  <2 x i32> } ; returned by @ld4_2s
+
+define %struct.__neon_int32x2x2_t @ld2_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x2_t @ld2_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X2_T]] [[TMP8]]
+;
+; The CHECK lines above pin MSan's output for ld2: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x2x3_t @ld3_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x3_t @ld3_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X3_T]] [[TMP10]]
+;
+; The CHECK lines above pin MSan's output for ld3: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x2x4_t @ld4_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x4_t @ld4_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP10]], <2 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X4_T]] [[TMP12]]
+;
+; The CHECK lines above pin MSan's output for ld4: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0(ptr) nounwind readonly
+
+%struct.__neon_int32x4x2_t = type { <4 x i32>,  <4 x i32> } ; returned by @ld2_4s
+%struct.__neon_int32x4x3_t = type { <4 x i32>,  <4 x i32>,  <4 x i32> } ; returned by @ld3_4s
+%struct.__neon_int32x4x4_t = type { <4 x i32>,  <4 x i32>, <4 x i32>,  <4 x i32> } ; returned by @ld4_4s
+
+define %struct.__neon_int32x4x2_t @ld2_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP8]]
+;
+; The CHECK lines above pin MSan's output for ld2: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x3_t @ld3_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP10]]
+;
+; The CHECK lines above pin MSan's output for ld3: %A's shadow is checked before the call; the return shadow is stored as zeroinitializer.
+  %tmpvar2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x4_t @ld4_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP10]], <4 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP12]]
+;
+; MSan coverage for llvm.aarch64.neon.ld4 producing four <4 x i32> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x4_t  %tmpvar2
+}
+
+; Intrinsic declarations used by the <4 x i32> ld2/ld3/ld4 tests above.
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0(ptr) nounwind readonly
+
+; Named aggregates of two, three and four <2 x i64> vectors, returned by the
+; ld2/ld3/ld4 tests that follow.
+%struct.__neon_int64x2x2_t = type { <2 x i64>,  <2 x i64> }
+%struct.__neon_int64x2x3_t = type { <2 x i64>,  <2 x i64>,  <2 x i64> }
+%struct.__neon_int64x2x4_t = type { <2 x i64>,  <2 x i64>, <2 x i64>,  <2 x i64> }
+
+define %struct.__neon_int64x2x2_t @ld2_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP8]]
+;
+; MSan coverage for llvm.aarch64.neon.ld2 producing two <2 x i64> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x3_t @ld3_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld3_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP10]]
+;
+; MSan coverage for llvm.aarch64.neon.ld3 producing three <2 x i64> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x4_t @ld4_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP10]], <2 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP12]]
+;
+; MSan coverage for llvm.aarch64.neon.ld4 producing four <2 x i64> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x4_t  %tmpvar2
+}
+
+; Intrinsic declarations used by the <2 x i64> ld2/ld3/ld4 tests above.
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0(ptr) nounwind readonly
+
+; Named aggregates of two, three and four <1 x i64> vectors, returned by the
+; ld2/ld3/ld4 tests that follow.
+%struct.__neon_int64x1x2_t = type { <1 x i64>,  <1 x i64> }
+%struct.__neon_int64x1x3_t = type { <1 x i64>,  <1 x i64>, <1 x i64> }
+%struct.__neon_int64x1x4_t = type { <1 x i64>,  <1 x i64>, <1 x i64>, <1 x i64> }
+
+
+define %struct.__neon_int64x1x2_t @ld2_1di64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x2_t @ld2_1di64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X2_T]] [[TMP8]]
+;
+; MSan coverage for llvm.aarch64.neon.ld2 producing two <1 x i64> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x1x3_t @ld3_1di64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x3_t @ld3_1di64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X3_T]] [[TMP10]]
+;
+; MSan coverage for llvm.aarch64.neon.ld3 producing three <1 x i64> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x1x4_t @ld4_1di64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x4_t @ld4_1di64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP10]], <1 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X4_T]] [[TMP12]]
+;
+; MSan coverage for llvm.aarch64.neon.ld4 producing four <1 x i64> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x4_t  %tmpvar2
+}
+
+
+; Intrinsic declarations used by the <1 x i64> ld2/ld3/ld4 tests above.
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0(ptr) nounwind readonly
+
+; Named aggregates of two, three and four <1 x double> vectors, returned by the
+; ld2/ld3/ld4 tests that follow.
+%struct.__neon_float64x1x2_t = type { <1 x double>,  <1 x double> }
+%struct.__neon_float64x1x3_t = type { <1 x double>,  <1 x double>, <1 x double> }
+%struct.__neon_float64x1x4_t = type { <1 x double>,  <1 x double>, <1 x double>, <1 x double> }
+
+
+define %struct.__neon_float64x1x2_t @ld2_1df64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x2_t @ld2_1df64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X2_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X2_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X2_T]] [[TMP8]]
+;
+; MSan coverage for llvm.aarch64.neon.ld2 producing two <1 x double> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls. The return shadow is typed with <1 x i64>
+; elements in place of <1 x double>.
+  %tmpvar2 = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0(ptr %A)
+  ret %struct.__neon_float64x1x2_t  %tmpvar2
+}
+
+define %struct.__neon_float64x1x3_t @ld3_1df64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x3_t @ld3_1df64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP8]], <1 x double> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP10]]
+;
+; MSan coverage for llvm.aarch64.neon.ld3 producing three <1 x double> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls. The return shadow is typed with <1 x i64>
+; elements in place of <1 x double>.
+  %tmpvar2 = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0(ptr %A)
+  ret %struct.__neon_float64x1x3_t  %tmpvar2
+}
+
+define %struct.__neon_float64x1x4_t @ld4_1df64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x4_t @ld4_1df64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP8]], <1 x double> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP10]], <1 x double> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP12]]
+;
+; MSan coverage for llvm.aarch64.neon.ld4 producing four <1 x double> vectors.
+; Per the assertions above, the instrumented code only checks the shadow of the
+; pointer %A (calling __msan_warning_noreturn if it is non-zero), reads no
+; shadow for the memory behind %A, and stores an all-zero shadow for the return
+; value to __msan_retval_tls. The return shadow is typed with <1 x i64>
+; elements in place of <1 x double>.
+  %tmpvar2 = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0(ptr %A)
+  ret %struct.__neon_float64x1x4_t  %tmpvar2
+}
+
+; Intrinsic declarations used by the <1 x double> ld2/ld3/ld4 tests above.
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0(ptr) nounwind readonly
+
+
+define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2lane_16b(
+; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[L1]], <16 x i8> [[L2]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T:%.*]] poison, <16 x i8> [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP8]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T]] [[TMP10]], <16 x i8> [[TMP11]], 1
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP12]]
+;
+; MSan coverage for llvm.aarch64.neon.ld2lane on <16 x i8> (i64 operand 1).
+; Per the assertions above, the shadows of %L1, %L2 and %A are each compared
+; against zero and OR-ed together; any non-zero shadow branches to
+; __msan_warning_noreturn. No shadow is read for the memory behind %A, and an
+; all-zero shadow for the return value is stored to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, i64 1, ptr %A)
+  ret %struct.__neon_int8x16x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3lane_16b(
+; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], <16 x i8> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[L1]], <16 x i8> [[L2]], <16 x i8> [[L3]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T:%.*]] poison, <16 x i8> [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP12]], <16 x i8> [[TMP13]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP10]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP14]], <16 x i8> [[TMP15]], 2
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP16]]
+;
+; MSan coverage for llvm.aarch64.neon.ld3lane on <16 x i8> (i64 operand 1).
+; Per the assertions above, the shadows of %L1, %L2, %L3 and %A are each
+; compared against zero and OR-ed together; any non-zero shadow branches to
+; __msan_warning_noreturn. No shadow is read for the memory behind %A, and an
+; all-zero shadow for the return value is stored to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, ptr %A)
+  ret %struct.__neon_int8x16x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4lane_16b(
+; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], <16 x i8> [[L3:%.*]], <16 x i8> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[L1]], <16 x i8> [[L2]], <16 x i8> [[L3]], <16 x i8> [[L4]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T:%.*]] poison, <16 x i8> [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP12]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP14]], <16 x i8> [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP12]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP16]], <16 x i8> [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP12]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP18]], <16 x i8> [[TMP19]], 3
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP20]]
+;
+; MSan coverage for llvm.aarch64.neon.ld4lane on <16 x i8> (i64 operand 1).
+; Per the assertions above, the shadows of %L1, %L2, %L3, %L4 and %A are each
+; compared against zero and OR-ed together; any non-zero shadow branches to
+; __msan_warning_noreturn. No shadow is read for the memory behind %A, and an
+; all-zero shadow for the return value is stored to __msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, ptr %A)
+  ret %struct.__neon_int8x16x4_t  %tmpvar2
+}
+
+; Intrinsic declarations used by the <16 x i8> ld2lane/ld3lane/ld4lane tests
+; above. Each takes N <16 x i8> vectors, an i64 (presumably the lane selector,
+; going by the intrinsic name; the tests pass the constant 1) and a pointer.
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8>, <16 x i8>, i64, ptr) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, i64, ptr) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, ptr) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, ptr %A) nounwind #0 {
+; MSan test of ld2lane (lane 1) on <8 x i16>. The autogenerated checks expect the shadows of both vector operands and of the pointer to be checked before the call, and a zeroinitializer return shadow to be stored. The CHECK-SD and CHECK-GI lines, like the original note "Make sure we are using the operands defined by the ABI", are inherited from the CodeGen test this file was forked from; the RUN line only uses the default check prefix, so they are not active assertions here.
+; CHECK-SD-LABEL: ld2lane_8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ld2.h { v0, v1 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld2lane_8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ld2.h { v0, v1 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2lane_8h(
+; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[L1]], <8 x i16> [[L2]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T:%.*]] poison, <8 x i16> [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP8]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T]] [[TMP10]], <8 x i16> [[TMP11]], 1
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP12]]
+;
+  %tmpvar2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, i64 1, ptr %A)
+  ret %struct.__neon_int16x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, ptr %A) nounwind #0 {
+; MSan test of ld3lane (lane 1) on <8 x i16>. The autogenerated checks expect the shadows of all three vector operands and of the pointer to be checked before the call, and a zeroinitializer return shadow to be stored. The CHECK-SD and CHECK-GI lines, like the original note "Make sure we are using the operands defined by the ABI", are inherited from the CodeGen test this file was forked from; the RUN line only uses the default check prefix, so they are not active assertions here.
+; CHECK-SD-LABEL: ld3lane_8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    ld3.h { v0, v1, v2 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld3lane_8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    ld3.h { v0, v1, v2 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3lane_8h(
+; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], <8 x i16> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[L1]], <8 x i16> [[L2]], <8 x i16> [[L3]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T:%.*]] poison, <8 x i16> [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP12]], <8 x i16> [[TMP13]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP10]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP14]], <8 x i16> [[TMP15]], 2
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP16]]
+;
+  %tmpvar2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, ptr %A)
+  ret %struct.__neon_int16x8x3_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, ptr %A) nounwind #0 {
+; MSan test of ld4lane (lane 1) on <8 x i16>. The autogenerated checks expect the shadows of all four vector operands and of the pointer to be checked before the call, and a zeroinitializer return shadow to be stored. The CHECK-SD and CHECK-GI lines, like the original note "Make sure we are using the operands defined by the ABI", are inherited from the CodeGen test this file was forked from; the RUN line only uses the default check prefix, so they are not active assertions here.
+; CHECK-SD-LABEL: ld4lane_8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ld4.h { v0, v1, v2, v3 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld4lane_8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    ld4.h { v0, v1, v2, v3 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4lane_8h(
+; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], <8 x i16> [[L3:%.*]], <8 x i16> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[L1]], <8 x i16> [[L2]], <8 x i16> [[L3]], <8 x i16> [[L4]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T:%.*]] poison, <8 x i16> [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP12]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP14]], <8 x i16> [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP12]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP16]], <8 x i16> [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP12]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP18]], <8 x i16> [[TMP19]], 3
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP20]]
+;
+  %tmpvar2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, ptr %A)
+  ret %struct.__neon_int16x8x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16>, <8 x i16>, i64, ptr) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, i64, ptr) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, ptr) nounwind readonly
+
+define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, ptr %A) nounwind #0 {
+; MSan test of ld2lane (lane 1) on <4 x i32>. The autogenerated checks expect the shadows of both vector operands and of the pointer to be checked before the call, and a zeroinitializer return shadow to be stored. The CHECK-SD and CHECK-GI lines, like the original note "Make sure we are using the operands defined by the ABI", are inherited from the CodeGen test this file was forked from; the RUN line only uses the default check prefix, so they are not active assertions here.
+; CHECK-SD-LABEL: ld2lane_4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ld2.s { v0, v1 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld2lane_4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ld2.s { v0, v1 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2lane_4s(
+; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[L1]], <4 x i32> [[L2]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T:%.*]] poison, <4 x i32> [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP8]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T]] [[TMP10]], <4 x i32> [[TMP11]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP12]]
+;
+  %tmpvar2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, i64 1, ptr %A)
+  ret %struct.__neon_int32x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, ptr %A) nounwind #0 {
+; MSan test of ld3lane (lane 1) on <4 x i32>. The autogenerated checks expect the shadows of all three vector operands and of the pointer to be checked before the call, and a zeroinitializer return shadow to be stored. The CHECK-SD and CHECK-GI lines, like the original note "Make sure we are using the operands defined by the ABI", are inherited from the CodeGen test this file was forked from; the RUN line only uses the default check prefix, so they are not active assertions here.
+; CHECK-SD-LABEL: ld3lane_4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    ld3.s { v0, v1, v2 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld3lane_4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    ld3.s { v0, v1, v2 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3lane_4s(
+; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], <4 x i32> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> [[L3]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T:%.*]] poison, <4 x i32> [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP12]], <4 x i32> [[TMP13]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP10]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP14]], <4 x i32> [[TMP15]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP16]]
+;
+  %tmpvar2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, ptr %A)
+  ret %struct.__neon_int32x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, ptr %A) nounwind #0 {
+; MSan test of ld4lane (lane 1) on <4 x i32>. The autogenerated checks expect the shadows of all four vector operands and of the pointer to be checked before the call, and a zeroinitializer return shadow to be stored. The CHECK-SD and CHECK-GI lines, like the original note "Make sure we are using the operands defined by the ABI", are inherited from the CodeGen test this file was forked from; the RUN line only uses the default check prefix, so they are not active assertions here.
+; CHECK-SD-LABEL: ld4lane_4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ld4.s { v0, v1, v2, v3 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld4lane_4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    ld4.s { v0, v1, v2, v3 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4lane_4s(
+; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], <4 x i32> [[L3:%.*]], <4 x i32> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> [[L3]], <4 x i32> [[L4]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T:%.*]] poison, <4 x i32> [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP14]], <4 x i32> [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP16]], <4 x i32> [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP18]], <4 x i32> [[TMP19]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP20]]
+;
+  %tmpvar2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, ptr %A)
+  ret %struct.__neon_int32x4x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32>, <4 x i32>, i64, ptr) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, i64, ptr) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, ptr) nounwind readonly
+
+define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, ptr %A) nounwind #0 {
+; MSan test of ld2lane (lane 1) on <2 x i64>. The autogenerated checks expect the shadows of both vector operands and of the pointer to be checked before the call, and a zeroinitializer return shadow to be stored. The CHECK-SD and CHECK-GI lines, like the original note "Make sure we are using the operands defined by the ABI", are inherited from the CodeGen test this file was forked from; the RUN line only uses the default check prefix, so they are not active assertions here.
+; CHECK-SD-LABEL: ld2lane_2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ld2.d { v0, v1 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld2lane_2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ld2.d { v0, v1 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2lane_2d(
+; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[L1]], <2 x i64> [[L2]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T:%.*]] poison, <2 x i64> [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP8]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T]] [[TMP10]], <2 x i64> [[TMP11]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP12]]
+;
+  %tmpvar2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, i64 1, ptr %A)
+  ret %struct.__neon_int64x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, ptr %A) nounwind #0 {
+; MSan test of ld3lane (lane 1) on <2 x i64>. The autogenerated checks expect the shadows of all three vector operands and of the pointer to be checked before the call, and a zeroinitializer return shadow to be stored. The CHECK-SD and CHECK-GI lines, like the original note "Make sure we are using the operands defined by the ABI", are inherited from the CodeGen test this file was forked from; the RUN line only uses the default check prefix, so they are not active assertions here.
+; CHECK-SD-LABEL: ld3lane_2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    ld3.d { v0, v1, v2 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld3lane_2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    ld3.d { v0, v1, v2 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld3lane_2d(
+; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], <2 x i64> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[L1]], <2 x i64> [[L2]], <2 x i64> [[L3]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T:%.*]] poison, <2 x i64> [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP12]], <2 x i64> [[TMP13]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP10]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP14]], <2 x i64> [[TMP15]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP16]]
+;
+  %tmpvar2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, ptr %A)
+  ret %struct.__neon_int64x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, ptr %A) nounwind #0 {
+; MSan test of ld4lane (lane 1) on <2 x i64>. The autogenerated checks expect the shadows of all four vector operands and of the pointer to be checked before the call, and a zeroinitializer return shadow to be stored. The CHECK-SD and CHECK-GI lines, like the original note "Make sure we are using the operands defined by the ABI", are inherited from the CodeGen test this file was forked from; the RUN line only uses the default check prefix, so they are not active assertions here.
+; CHECK-SD-LABEL: ld4lane_2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ld4.d { v0, v1, v2, v3 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld4lane_2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    ld4.d { v0, v1, v2, v3 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4lane_2d(
+; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], <2 x i64> [[L3:%.*]], <2 x i64> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[L1]], <2 x i64> [[L2]], <2 x i64> [[L3]], <2 x i64> [[L4]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T:%.*]] poison, <2 x i64> [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP12]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP14]], <2 x i64> [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP12]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP16]], <2 x i64> [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP12]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP18]], <2 x i64> [[TMP19]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP20]]
+;
+  %tmpvar2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, ptr %A)
+  ret %struct.__neon_int64x2x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64>, <2 x i64>, i64, ptr) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readonly
+
+define <8 x i8> @ld1r_8b(ptr %bar) #0 {
+; CHECK-LABEL: define <8 x i8> @ld1r_8b(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i8, ptr [[BAR]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i8> undef, i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <8 x i8> [[TMPVAR2]], i8 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <8 x i8> [[TMPVAR3]], i8 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <8 x i8> [[TMPVAR4]], i8 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[_MSLD]], i32 4
+; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <8 x i8> [[TMPVAR5]], i8 [[TMPVAR1]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP4]], i8 [[_MSLD]], i32 5
+; CHECK-NEXT:    [[TMPVAR7:%.*]] = insertelement <8 x i8> [[TMPVAR6]], i8 [[TMPVAR1]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[_MSLD]], i32 6
+; CHECK-NEXT:    [[TMPVAR8:%.*]] = insertelement <8 x i8> [[TMPVAR7]], i8 [[TMPVAR1]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[_MSLD]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> [[TMPVAR8]], i8 [[TMPVAR1]], i32 7
+; CHECK-NEXT:    store <8 x i8> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[TMP9]]
+;
+; Splat the i8 loaded from %bar into all 8 lanes; the assertions above expect the loaded shadow (_MSLD) to be inserted into every lane alongside the data.
+  %tmpvar1 = load i8, ptr %bar
+  %tmpvar2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmpvar1, i32 0
+  %tmpvar3 = insertelement <8 x i8> %tmpvar2, i8 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <8 x i8> %tmpvar3, i8 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <8 x i8> %tmpvar4, i8 %tmpvar1, i32 3
+  %tmpvar6 = insertelement <8 x i8> %tmpvar5, i8 %tmpvar1, i32 4
+  %tmpvar7 = insertelement <8 x i8> %tmpvar6, i8 %tmpvar1, i32 5
+  %tmpvar8 = insertelement <8 x i8> %tmpvar7, i8 %tmpvar1, i32 6
+  %tmpvar9 = insertelement <8 x i8> %tmpvar8, i8 %tmpvar1, i32 7
+  ret <8 x i8> %tmpvar9
+}
+
+define <16 x i8> @ld1r_16b(ptr %bar) #0 {
+; CHECK-LABEL: define <16 x i8> @ld1r_16b(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i8, ptr [[BAR]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> splat (i8 -1), i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <16 x i8> undef, i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <16 x i8> [[TMPVAR2]], i8 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <16 x i8> [[TMPVAR3]], i8 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <16 x i8> [[TMPVAR4]], i8 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[_MSLD]], i32 4
+; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <16 x i8> [[TMPVAR5]], i8 [[TMPVAR1]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[_MSLD]], i32 5
+; CHECK-NEXT:    [[TMPVAR7:%.*]] = insertelement <16 x i8> [[TMPVAR6]], i8 [[TMPVAR1]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[_MSLD]], i32 6
+; CHECK-NEXT:    [[TMPVAR8:%.*]] = insertelement <16 x i8> [[TMPVAR7]], i8 [[TMPVAR1]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[_MSLD]], i32 7
+; CHECK-NEXT:    [[TMPVAR9:%.*]] = insertelement <16 x i8> [[TMPVAR8]], i8 [[TMPVAR1]], i32 7
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 [[_MSLD]], i32 8
+; CHECK-NEXT:    [[TMPVAR10:%.*]] = insertelement <16 x i8> [[TMPVAR9]], i8 [[TMPVAR1]], i32 8
+; CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 [[_MSLD]], i32 9
+; CHECK-NEXT:    [[TMPVAR11:%.*]] = insertelement <16 x i8> [[TMPVAR10]], i8 [[TMPVAR1]], i32 9
+; CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 [[_MSLD]], i32 10
+; CHECK-NEXT:    [[TMPVAR12:%.*]] = insertelement <16 x i8> [[TMPVAR11]], i8 [[TMPVAR1]], i32 10
+; CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 [[_MSLD]], i32 11
+; CHECK-NEXT:    [[TMPVAR13:%.*]] = insertelement <16 x i8> [[TMPVAR12]], i8 [[TMPVAR1]], i32 11
+; CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 [[_MSLD]], i32 12
+; CHECK-NEXT:    [[TMPVAR14:%.*]] = insertelement <16 x i8> [[TMPVAR13]], i8 [[TMPVAR1]], i32 12
+; CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 [[_MSLD]], i32 13
+; CHECK-NEXT:    [[TMPVAR15:%.*]] = insertelement <16 x i8> [[TMPVAR14]], i8 [[TMPVAR1]], i32 13
+; CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 [[_MSLD]], i32 14
+; CHECK-NEXT:    [[TMPVAR16:%.*]] = insertelement <16 x i8> [[TMPVAR15]], i8 [[TMPVAR1]], i32 14
+; CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 [[_MSLD]], i32 15
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i8> [[TMPVAR16]], i8 [[TMPVAR1]], i32 15
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP15]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[TMP17]]
+;
+; Splat the i8 loaded from %bar into all 16 lanes; the assertions above expect the loaded shadow (_MSLD) to be inserted into every lane alongside the data.
+  %tmpvar1 = load i8, ptr %bar
+  %tmpvar2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmpvar1, i32 0
+  %tmpvar3 = insertelement <16 x i8> %tmpvar2, i8 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <16 x i8> %tmpvar3, i8 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <16 x i8> %tmpvar4, i8 %tmpvar1, i32 3
+  %tmpvar6 = insertelement <16 x i8> %tmpvar5, i8 %tmpvar1, i32 4
+  %tmpvar7 = insertelement <16 x i8> %tmpvar6, i8 %tmpvar1, i32 5
+  %tmpvar8 = insertelement <16 x i8> %tmpvar7, i8 %tmpvar1, i32 6
+  %tmpvar9 = insertelement <16 x i8> %tmpvar8, i8 %tmpvar1, i32 7
+  %tmpvar10 = insertelement <16 x i8> %tmpvar9, i8 %tmpvar1, i32 8
+  %tmpvar11 = insertelement <16 x i8> %tmpvar10, i8 %tmpvar1, i32 9
+  %tmpvar12 = insertelement <16 x i8> %tmpvar11, i8 %tmpvar1, i32 10
+  %tmpvar13 = insertelement <16 x i8> %tmpvar12, i8 %tmpvar1, i32 11
+  %tmpvar14 = insertelement <16 x i8> %tmpvar13, i8 %tmpvar1, i32 12
+  %tmpvar15 = insertelement <16 x i8> %tmpvar14, i8 %tmpvar1, i32 13
+  %tmpvar16 = insertelement <16 x i8> %tmpvar15, i8 %tmpvar1, i32 14
+  %tmpvar17 = insertelement <16 x i8> %tmpvar16, i8 %tmpvar1, i32 15
+  ret <16 x i8> %tmpvar17
+}
+
+define <4 x i16> @ld1r_4h(ptr %bar) #0 {
+; CHECK-LABEL: define <4 x i16> @ld1r_4h(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i16, ptr [[BAR]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i16> undef, i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i16> [[_MSPROP]], i16 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x i16> [[TMPVAR2]], i16 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i16> [[_MSPROP1]], i16 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <4 x i16> [[TMPVAR3]], i16 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i16> [[_MSPROP2]], i16 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <4 x i16> [[TMPVAR4]], i16 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    store <4 x i16> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i16> [[TMPVAR5]]
+;
+; Splat the i16 loaded from %bar into all 4 lanes; the assertions above expect the loaded shadow (_MSLD) to be inserted into every lane alongside the data.
+  %tmpvar1 = load i16, ptr %bar
+  %tmpvar2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmpvar1, i32 0
+  %tmpvar3 = insertelement <4 x i16> %tmpvar2, i16 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <4 x i16> %tmpvar3, i16 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <4 x i16> %tmpvar4, i16 %tmpvar1, i32 3
+  ret <4 x i16> %tmpvar5
+}
+
+define <8 x i16> @ld1r_8h(ptr %bar) #0 {
+; CHECK-LABEL: define <8 x i16> @ld1r_8h(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i16, ptr [[BAR]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i16> undef, i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <8 x i16> [[TMPVAR2]], i16 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <8 x i16> [[TMPVAR3]], i16 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <8 x i16> [[TMPVAR4]], i16 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[_MSLD]], i32 4
+; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <8 x i16> [[TMPVAR5]], i16 [[TMPVAR1]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP4]], i16 [[_MSLD]], i32 5
+; CHECK-NEXT:    [[TMPVAR7:%.*]] = insertelement <8 x i16> [[TMPVAR6]], i16 [[TMPVAR1]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[_MSLD]], i32 6
+; CHECK-NEXT:    [[TMPVAR8:%.*]] = insertelement <8 x i16> [[TMPVAR7]], i16 [[TMPVAR1]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[_MSLD]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i16> [[TMPVAR8]], i16 [[TMPVAR1]], i32 7
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[TMP9]]
+;
+; Splat the i16 loaded from %bar into all 8 lanes; the assertions above expect the loaded shadow (_MSLD) to be inserted into every lane alongside the data.
+  %tmpvar1 = load i16, ptr %bar
+  %tmpvar2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmpvar1, i32 0
+  %tmpvar3 = insertelement <8 x i16> %tmpvar2, i16 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <8 x i16> %tmpvar3, i16 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <8 x i16> %tmpvar4, i16 %tmpvar1, i32 3
+  %tmpvar6 = insertelement <8 x i16> %tmpvar5, i16 %tmpvar1, i32 4
+  %tmpvar7 = insertelement <8 x i16> %tmpvar6, i16 %tmpvar1, i32 5
+  %tmpvar8 = insertelement <8 x i16> %tmpvar7, i16 %tmpvar1, i32 6
+  %tmpvar9 = insertelement <8 x i16> %tmpvar8, i16 %tmpvar1, i32 7
+  ret <8 x i16> %tmpvar9
+}
+
+define <2 x i32> @ld1r_2s(ptr %bar) #0 {
+; CHECK-LABEL: define <2 x i32> @ld1r_2s(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> undef, i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMPVAR2]], i32 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
+;
+; Splat the i32 loaded from %bar into both lanes; the assertions above expect the loaded shadow (_MSLD) to be inserted into every lane alongside the data.
+  %tmpvar1 = load i32, ptr %bar
+  %tmpvar2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmpvar1, i32 0
+  %tmpvar3 = insertelement <2 x i32> %tmpvar2, i32 %tmpvar1, i32 1
+  ret <2 x i32> %tmpvar3
+}
+
+define <4 x i32> @ld1r_4s(ptr %bar) #0 {
+; CHECK-LABEL: define <4 x i32> @ld1r_4s(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i32> undef, i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x i32> [[TMPVAR2]], i32 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <4 x i32> [[TMPVAR3]], i32 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <4 x i32> [[TMPVAR4]], i32 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMPVAR5]]
+;
+; Splat the i32 loaded from %bar into all 4 lanes; the assertions above expect the loaded shadow (_MSLD) to be inserted into every lane alongside the data.
+  %tmpvar1 = load i32, ptr %bar
+  %tmpvar2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmpvar1, i32 0
+  %tmpvar3 = insertelement <4 x i32> %tmpvar2, i32 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <4 x i32> %tmpvar3, i32 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <4 x i32> %tmpvar4, i32 %tmpvar1, i32 3
+  ret <4 x i32> %tmpvar5
+}
+
+define <2 x i64> @ld1r_2d(ptr %bar) #0 {
+; CHECK-LABEL: define <2 x i64> @ld1r_2d(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i64, ptr [[BAR]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i64> undef, i64 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMPVAR2]], i64 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+; Splat the i64 loaded from %bar into both lanes; the assertions above expect the loaded shadow (_MSLD) to be inserted into every lane alongside the data.
+  %tmpvar1 = load i64, ptr %bar
+  %tmpvar2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmpvar1, i32 0
+  %tmpvar3 = insertelement <2 x i64> %tmpvar2, i64 %tmpvar1, i32 1
+  ret <2 x i64> %tmpvar3
+}
+
+define %struct.__neon_int8x8x2_t @ld2r_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x2_t @ld2r_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X2_T]] [[TMP8]]
+;
+; ld2r.v8i8 on %A; the assertions above expect only a shadow check of the pointer argument and a zeroinitializer return shadow (intrinsic is on the file header's "Incorrectly handled" list).
+  %tmpvar2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x8x3_t @ld3r_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x3_t @ld3r_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X3_T]] [[TMP10]]
+;
+; ld3r.v8i8 on %A; the assertions above expect only a shadow check of the pointer argument and a zeroinitializer return shadow (intrinsic is on the file header's "Incorrectly handled" list).
+  %tmpvar2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x8x4_t @ld4r_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x4_t @ld4r_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP10]], <8 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X4_T]] [[TMP12]]
+;
+; ld4r.v8i8 on %A; the assertions above expect only a shadow check of the pointer argument and a zeroinitializer return shadow (intrinsic is on the file header's "Incorrectly handled" list).
+  %tmpvar2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x16x2_t @ld2r_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2r_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP8]]
+;
+; ld2r.v16i8 on %A; the assertions above expect only a shadow check of the pointer argument and a zeroinitializer return shadow (intrinsic is on the file header's "Incorrectly handled" list).
+  %tmpvar2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x3_t @ld3r_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3r_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP10]]
+;
+; ld3r.v16i8 on %A; the assertions above expect only a shadow check of the pointer argument and a zeroinitializer return shadow (intrinsic is on the file header's "Incorrectly handled" list).
+  %tmpvar2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x4_t @ld4r_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4r_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP10]], <16 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP12]]
+;
+; ld4r.v16i8 on %A; the assertions above expect only a shadow check of the pointer argument and a zeroinitializer return shadow (intrinsic is on the file header's "Incorrectly handled" list).
+  %tmpvar2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0(ptr) nounwind readonly
+
+define %struct.__neon_int16x4x2_t @ld2r_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x2_t @ld2r_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X2_T]] [[TMP8]]
+;
+; MSan view of ld2r.v4i16: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x4x3_t @ld3r_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x3_t @ld3r_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X3_T]] [[TMP10]]
+;
+; MSan view of ld3r.v4i16: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int16x4x4_t @ld4r_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x4_t @ld4r_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP10]], <4 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X4_T]] [[TMP12]]
+;
+; MSan view of ld4r.v4i16: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0(ptr) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2r_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2r_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP8]]
+;
+; MSan view of ld2r.v8i16: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x3_t @ld3r_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3r_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP10]]
+;
+; MSan view of ld3r.v8i16: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x3_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x4_t @ld4r_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4r_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP10]], <8 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP12]]
+;
+; MSan view of ld4r.v8i16: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0(ptr) nounwind readonly
+
+define %struct.__neon_int32x2x2_t @ld2r_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x2_t @ld2r_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X2_T]] [[TMP8]]
+;
+; MSan view of ld2r.v2i32: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x2x3_t @ld3r_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x3_t @ld3r_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X3_T]] [[TMP10]]
+;
+; MSan view of ld3r.v2i32: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x2x4_t @ld4r_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x4_t @ld4r_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP10]], <2 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X4_T]] [[TMP12]]
+;
+; MSan view of ld4r.v2i32: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0(ptr) nounwind readonly
+
+define %struct.__neon_int32x4x2_t @ld2r_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2r_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP8]]
+;
+; MSan view of ld2r.v4i32: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x3_t @ld3r_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3r_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP10]]
+;
+; MSan view of ld3r.v4i32: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x4_t @ld4r_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4r_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP10]], <4 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP12]]
+;
+; MSan view of ld4r.v4i32: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0(ptr) nounwind readonly
+
+define %struct.__neon_int64x1x2_t @ld2r_1d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x2_t @ld2r_1d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X2_T]] [[TMP8]]
+;
+; MSan view of ld2r.v1i64: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x1x3_t @ld3r_1d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x3_t @ld3r_1d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X3_T]] [[TMP10]]
+;
+; MSan view of ld3r.v1i64: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x1x4_t @ld4r_1d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x4_t @ld4r_1d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP10]], <1 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X4_T]] [[TMP12]]
+;
+; MSan view of ld4r.v1i64: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int64x2x2_t @ld2r_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2r_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP8]]
+;
+; MSan view of ld2r.v2i64: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x3_t @ld3r_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld3r_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP10]]
+;
+; MSan view of ld3r.v2i64: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x4_t @ld4r_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4r_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP10]], <2 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP12]]
+;
+; MSan view of ld4r.v2i64: the shadow of %A is checked before the load, and the returned struct gets an all-zero shadow.
+  %tmpvar2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0(ptr) nounwind readonly
+
+define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <16 x i8> @ld1_16b(
+; CHECK-SAME: <16 x i8> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i8, ptr [[BAR]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <16 x i8> [[V]], i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[TMPVAR2]]
+;
+; Scalar i8 load inserted into lane 0 of %V. The expected IR checks the shadow
+; of %bar, loads the byte's shadow from the xor-mapped address, inserts it into
+; lane 0 of %V's shadow, and stores that vector shadow to __msan_retval_tls.
+  %tmpvar1 = load i8, ptr %bar
+  %tmpvar2 = insertelement <16 x i8> %V, i8 %tmpvar1, i32 0
+  ret <16 x i8> %tmpvar2
+}
+
+; Scalar i16 load from %bar inserted into lane 0 of %V. The expected output
+; loads the i16 shadow from the shadow address of %bar (pointer XOR
+; 0xB00000000000) and inserts it into lane 0 of the shadow of %V, after first
+; checking the shadow of the pointer %bar itself.
+define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <8 x i16> @ld1_8h(
+; CHECK-SAME: <8 x i16> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i16, ptr [[BAR]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i16> [[V]], i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[TMPVAR2]]
+;
+  %tmpvar1 = load i16, ptr %bar
+  %tmpvar2 = insertelement <8 x i16> %V, i16 %tmpvar1, i32 0
+  ret <8 x i16> %tmpvar2
+}
+
+; Scalar i32 load from %bar inserted into lane 0 of %V. The expected output
+; loads the i32 shadow from the shadow address of %bar (pointer XOR
+; 0xB00000000000) and inserts it into lane 0 of the shadow of %V, after first
+; checking the shadow of the pointer %bar itself.
+define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <4 x i32> @ld1_4s(
+; CHECK-SAME: <4 x i32> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i32> [[V]], i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMPVAR2]]
+;
+  %tmpvar1 = load i32, ptr %bar
+  %tmpvar2 = insertelement <4 x i32> %V, i32 %tmpvar1, i32 0
+  ret <4 x i32> %tmpvar2
+}
+
+; Scalar float load from %bar inserted into lane 0 of %V. The expected shadow
+; uses integer types (i32 element, <4 x i32> vector); it is loaded from the
+; shadow address of %bar (pointer XOR 0xB00000000000) and inserted into lane 0
+; of the shadow of %V, after first checking the shadow of %bar itself.
+define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <4 x float> @ld1_4s_float(
+; CHECK-SAME: <4 x float> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load float, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x float> [[V]], float [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMPVAR2]]
+;
+  %tmpvar1 = load float, ptr %bar
+  %tmpvar2 = insertelement <4 x float> %V, float %tmpvar1, i32 0
+  ret <4 x float> %tmpvar2
+}
+
+; Scalar i64 load from %bar inserted into lane 0 of %V. The expected output
+; loads the i64 shadow from the shadow address of %bar (pointer XOR
+; 0xB00000000000) and inserts it into lane 0 of the shadow of %V, after first
+; checking the shadow of the pointer %bar itself.
+define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <2 x i64> @ld1_2d(
+; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i64, ptr [[BAR]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i64> [[V]], i64 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMPVAR2]]
+;
+  %tmpvar1 = load i64, ptr %bar
+  %tmpvar2 = insertelement <2 x i64> %V, i64 %tmpvar1, i32 0
+  ret <2 x i64> %tmpvar2
+}
+
+; Scalar double load from %bar inserted into lane 0 of %V. The expected shadow
+; uses integer types (i64 element, <2 x i64> vector); it is loaded from the
+; shadow address of %bar (pointer XOR 0xB00000000000) and inserted into lane 0
+; of the shadow of %V, after first checking the shadow of %bar itself.
+define <2 x double> @ld1_2d_double(<2 x double> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <2 x double> @ld1_2d_double(
+; CHECK-SAME: <2 x double> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load double, ptr [[BAR]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x double> [[V]], double [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMPVAR2]]
+;
+  %tmpvar1 = load double, ptr %bar
+  %tmpvar2 = insertelement <2 x double> %V, double %tmpvar1, i32 0
+  ret <2 x double> %tmpvar2
+}
+
+; Whole-vector <1 x i64> load from %p returned directly. The expected output
+; checks the shadow of the pointer %p, then loads a <1 x i64> shadow from the
+; shadow address of %p (pointer XOR 0xB00000000000) and stores it as the
+; return-value shadow.
+define <1 x i64> @ld1_1d(ptr %p) #0 {
+; CHECK-LABEL: define <1 x i64> @ld1_1d(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP:%.*]] = load <1 x i64>, ptr [[P]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    store <1 x i64> [[_MSLD]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <1 x i64> [[TMP]]
+;
+  %tmpvar = load <1 x i64>, ptr %p, align 8
+  ret <1 x i64> %tmpvar
+}
+
+; Scalar i8 load from %bar inserted into lane 0 of the 64-bit vector %V. The
+; expected output loads the i8 shadow from the shadow address of %bar (pointer
+; XOR 0xB00000000000) and inserts it into lane 0 of the shadow of %V, after
+; first checking the shadow of the pointer %bar itself.
+define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <8 x i8> @ld1_8b(
+; CHECK-SAME: <8 x i8> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i8, ptr [[BAR]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i8> [[V]], i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[TMPVAR2]]
+;
+  %tmpvar1 = load i8, ptr %bar
+  %tmpvar2 = insertelement <8 x i8> %V, i8 %tmpvar1, i32 0
+  ret <8 x i8> %tmpvar2
+}
+
+; Scalar i16 load from %bar inserted into lane 0 of the 64-bit vector %V. The
+; expected output loads the i16 shadow from the shadow address of %bar
+; (pointer XOR 0xB00000000000) and inserts it into lane 0 of the shadow of %V,
+; after first checking the shadow of the pointer %bar itself.
+define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <4 x i16> @ld1_4h(
+; CHECK-SAME: <4 x i16> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i16, ptr [[BAR]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i16> [[V]], i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i16> [[TMPVAR2]]
+;
+  %tmpvar1 = load i16, ptr %bar
+  %tmpvar2 = insertelement <4 x i16> %V, i16 %tmpvar1, i32 0
+  ret <4 x i16> %tmpvar2
+}
+
+; Scalar i32 load from %bar inserted into lane 0 of the 64-bit vector %V. The
+; expected output loads the i32 shadow from the shadow address of %bar
+; (pointer XOR 0xB00000000000) and inserts it into lane 0 of the shadow of %V,
+; after first checking the shadow of the pointer %bar itself.
+define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <2 x i32> @ld1_2s(
+; CHECK-SAME: <2 x i32> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> [[V]], i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i32> [[TMPVAR2]]
+;
+  %tmpvar1 = load i32, ptr %bar
+  %tmpvar2 = insertelement <2 x i32> %V, i32 %tmpvar1, i32 0
+  ret <2 x i32> %tmpvar2
+}
+
+; Scalar float load from %bar inserted into lane 0 of the 64-bit vector %V.
+; The expected shadow uses integer types (i32 element, <2 x i32> vector); it
+; is loaded from the shadow address of %bar (pointer XOR 0xB00000000000) and
+; inserted into lane 0 of the shadow of %V, after first checking the shadow of
+; %bar itself.
+define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <2 x float> @ld1_2s_float(
+; CHECK-SAME: <2 x float> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load float, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x float> [[V]], float [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x float> [[TMPVAR2]]
+;
+  %tmpvar1 = load float, ptr %bar
+  %tmpvar2 = insertelement <2 x float> %V, float %tmpvar1, i32 0
+  ret <2 x float> %tmpvar2
+}
+
+
+; Add rdar://13098923 test case: vld1_dup_u32 doesn't generate ld1r.2s
+; Two i32 loads (from %a and %b) are each splatted to <2 x i32> by an
+; insertelement into undef plus a zero-mask shufflevector, bitcast to
+; <8 x i8>, zero-extended and subtracted; the low 64 bits of the result are
+; stored to %diff. The expected shadow mirrors every step (undef lanes start
+; as all-ones shadow), ORs the two operand shadows for the sub, and is stored
+; to the shadow address of %diff. The shadow of each pointer argument is
+; checked before that pointer is first dereferenced.
+define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture %diff) nounwind ssp #0 {
+; CHECK-LABEL: define void @ld1r_2s_from_dup(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture [[B:%.*]], ptr nocapture [[DIFF:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP22:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> undef, i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i32> [[_MSPROP]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMPVAR2]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[_MSPROP1]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP9]], label [[TMP23:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
+; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT:    [[_MSLD2:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD2]], i32 0
+; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <2 x i32> undef, i32 [[TMPVAR5]], i32 0
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <2 x i32> [[_MSPROP3]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE1:%.*]] = shufflevector <2 x i32> [[TMPVAR6]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[_MSPROP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMPVAR7:%.*]] = bitcast <2 x i32> [[LANE1]] to <8 x i8>
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i16>
+; CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i16>
+; CHECK-NEXT:    [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[TMPVAR7]] to <8 x i16>
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = or <8 x i16> [[_MSPROP5]], [[_MSPROP6]]
+; CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP7]] to <2 x i64>
+; CHECK-NEXT:    [[TMPVAR8:%.*]] = bitcast <8 x i16> [[SUB_I]] to <2 x i64>
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = shufflevector <2 x i64> [[TMP15]], <2 x i64> splat (i64 -1), <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMPVAR8]], <2 x i64> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[_MSPROP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
+; CHECK-NEXT:    [[_MSCMP10:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP10]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       17:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       18:
+; CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[DIFF]] to i64
+; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+; CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+; CHECK-NEXT:    store <4 x i16> [[TMP16]], ptr [[TMP21]], align 8
+; CHECK-NEXT:    store <4 x i16> [[TMP9]], ptr [[DIFF]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmpvar1 = load i32, ptr %a, align 4
+  %tmpvar2 = insertelement <2 x i32> undef, i32 %tmpvar1, i32 0
+  %lane = shufflevector <2 x i32> %tmpvar2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmpvar3 = bitcast <2 x i32> %lane to <8 x i8>
+  %tmpvar5 = load i32, ptr %b, align 4
+  %tmpvar6 = insertelement <2 x i32> undef, i32 %tmpvar5, i32 0
+  %lane1 = shufflevector <2 x i32> %tmpvar6, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmpvar7 = bitcast <2 x i32> %lane1 to <8 x i8>
+  %vmovl.i.i = zext <8 x i8> %tmpvar3 to <8 x i16>
+  %vmovl.i4.i = zext <8 x i8> %tmpvar7 to <8 x i16>
+  %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i4.i
+  %tmpvar8 = bitcast <8 x i16> %sub.i to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %tmpvar8, <2 x i64> undef, <1 x i32> zeroinitializer
+  %tmpvar9 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+  store <4 x i16> %tmpvar9, ptr %diff, align 8
+  ret void
+}
+
+; Tests for rdar://11947069: vld1_dup_* and vld1q_dup_* code gen is suboptimal
+; One float loaded from %x and written to all four lanes by a chain of
+; insertelement instructions starting from undef. The expected shadow starts
+; as all-ones (for undef) and has the loaded i32 shadow inserted into each
+; lane in turn; the shadow of the pointer %x is checked before the load.
+define <4 x float> @ld1r_4s_float(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <4 x float> @ld1r_4s_float(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TMP]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <4 x float> [[TMPVAR3]], float [[TMP]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMPVAR4]]
+;
+entry:
+  %tmpvar = load float, ptr %x, align 4
+  %tmpvar1 = insertelement <4 x float> undef, float %tmpvar, i32 0
+  %tmpvar2 = insertelement <4 x float> %tmpvar1, float %tmpvar, i32 1
+  %tmpvar3 = insertelement <4 x float> %tmpvar2, float %tmpvar, i32 2
+  %tmpvar4 = insertelement <4 x float> %tmpvar3, float %tmpvar, i32 3
+  ret <4 x float> %tmpvar4
+}
+
+; One float loaded from %x and written to both lanes by two insertelement
+; instructions starting from undef. The expected shadow starts as all-ones
+; (for undef) and has the loaded i32 shadow inserted into each lane; the
+; shadow of the pointer %x is checked before the load.
+define <2 x float> @ld1r_2s_float(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <2 x float> @ld1r_2s_float(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
+;
+entry:
+  %tmpvar = load float, ptr %x, align 4
+  %tmpvar1 = insertelement <2 x float> undef, float %tmpvar, i32 0
+  %tmpvar2 = insertelement <2 x float> %tmpvar1, float %tmpvar, i32 1
+  ret <2 x float> %tmpvar2
+}
+
+; One double loaded from %x with an explicit alignment of 4 and written to
+; both lanes by two insertelement instructions starting from undef. The
+; expected i64 shadow load carries the same alignment of 4; the vector shadow
+; starts as all-ones (for undef) and receives the loaded shadow in each lane.
+; The shadow of the pointer %x is checked before the load.
+define <2 x double> @ld1r_2d_double(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <2 x double> @ld1r_2d_double(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP]], i32 1
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+entry:
+  %tmpvar = load double, ptr %x, align 4
+  %tmpvar1 = insertelement <2 x double> undef, double %tmpvar, i32 0
+  %tmpvar2 = insertelement <2 x double> %tmpvar1, double %tmpvar, i32 1
+  ret <2 x double> %tmpvar2
+}
+
+; One double loaded from %x with an explicit alignment of 4 and inserted into
+; the single lane of an undef <1 x double>. The expected i64 shadow load
+; carries the same alignment of 4 and is inserted into an all-ones <1 x i64>
+; shadow; the shadow of the pointer %x is checked before the load.
+define <1 x double> @ld1r_1d_double(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <1 x double> @ld1r_1d_double(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <1 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <1 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <1 x double> [[TMP1]]
+;
+entry:
+  %tmpvar = load double, ptr %x, align 4
+  %tmpvar1 = insertelement <1 x double> undef, double %tmpvar, i32 0
+  ret <1 x double> %tmpvar1
+}
+
+; One float loaded from %x, inserted into lane 0 of undef and then broadcast
+; to all four lanes with a zero-mask shufflevector. The expected shadow
+; mirrors both steps, using an all-ones vector as the shadow of undef; the
+; shadow of the pointer %x is checked before the load.
+define <4 x float> @ld1r_4s_float_shuff(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <4 x float> @ld1r_4s_float_shuff(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[_MSPROP]], <4 x i32> splat (i32 -1), <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[LANE]]
+;
+entry:
+  %tmpvar = load float, ptr %x, align 4
+  %tmpvar1 = insertelement <4 x float> undef, float %tmpvar, i32 0
+  %lane = shufflevector <4 x float> %tmpvar1, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %lane
+}
+
+define <2 x float> @ld1r_2s_float_shuff(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <2 x float> @ld1r_2s_float_shuff(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i32> [[_MSPROP]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x float> [[LANE]]
+;
+entry:
+; Make sure we are using the operands defined by the ABI (comment presumably carried over from the CodeGen test this file was forked from; the assertions above cover MSan shadow handling of the load + insertelement + shufflevector splat).
+  %tmpvar = load float, ptr %x, align 4  ; expected: shadow of %x is checked first, then an i32 shadow is loaded from (address xor 193514046488576)
+  %tmpvar1 = insertelement <2 x float> undef, float %tmpvar, i32 0  ; expected shadow: loaded shadow in lane 0, all-ones in the undef lane
+  %lane = shufflevector <2 x float> %tmpvar1, <2 x float> undef, <2 x i32> zeroinitializer  ; zero mask broadcasts lane 0; the shadow vector gets the same shuffle
+  ret <2 x float> %lane  ; expected: the shuffled <2 x i32> shadow is stored to @__msan_retval_tls
+}
+
+define <2 x double> @ld1r_2d_double_shuff(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <2 x double> @ld1r_2d_double_shuff(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[_MSPROP]], <2 x i64> splat (i64 -1), <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[LANE]]
+;
+entry:
+; Make sure we are using the operands defined by the ABI (comment presumably carried over from the CodeGen test this file was forked from; the assertions above cover MSan shadow handling of the load + insertelement + shufflevector splat).
+  %tmpvar = load double, ptr %x, align 4  ; expected: shadow of %x is checked first, then an i64 shadow is loaded from (address xor 193514046488576)
+  %tmpvar1 = insertelement <2 x double> undef, double %tmpvar, i32 0  ; expected shadow: loaded shadow in lane 0, all-ones in the undef lane
+  %lane = shufflevector <2 x double> %tmpvar1, <2 x double> undef, <2 x i32> zeroinitializer  ; zero mask broadcasts lane 0; the shadow vector gets the same shuffle
+  ret <2 x double> %lane  ; expected: the shuffled <2 x i64> shadow is stored to @__msan_retval_tls
+}
+
+define <1 x double> @ld1r_1d_double_shuff(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <1 x double> @ld1r_1d_double_shuff(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <1 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <1 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <1 x i64> [[_MSPROP]], <1 x i64> splat (i64 -1), <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <1 x double> [[LANE]]
+;
+entry:
+; Make sure we are using the operands defined by the ABI (comment presumably carried over from the CodeGen test this file was forked from; the assertions above cover MSan shadow handling of the load + insertelement + shufflevector sequence).
+  %tmpvar = load double, ptr %x, align 4  ; expected: shadow of %x is checked first, then an i64 shadow is loaded from (address xor 193514046488576)
+  %tmpvar1 = insertelement <1 x double> undef, double %tmpvar, i32 0  ; expected shadow: loaded shadow inserted into lane 0 of an all-ones vector
+  %lane = shufflevector <1 x double> %tmpvar1, <1 x double> undef, <1 x i32> zeroinitializer  ; single-lane vector: the zero mask selects lane 0; the shadow vector gets the same shuffle
+  ret <1 x double> %lane  ; expected: the shuffled <1 x i64> shadow is stored to @__msan_retval_tls
+}
+
+%struct.__neon_float32x2x2_t = type { <2 x float>,  <2 x float> }  ; result type of ld1_x2_v2f32 below
+%struct.__neon_float32x2x3_t = type { <2 x float>,  <2 x float>,  <2 x float> }  ; result type of ld1_x3_v2f32 below
+%struct.__neon_float32x2x4_t = type { <2 x float>,  <2 x float>, <2 x float>,  <2 x float> }
+
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0(ptr) nounwind readonly  ; ld1x2 variants below each return a pair of 64-bit-wide vectors
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int8x8x2_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int16x4x2_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int32x2x2_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X2_T:%.*]] poison, <2 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X2_T]] [[TMP6]], <2 x float> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X2X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_float32x2x2_t %val  ; expected return shadow is a zeroinitializer of the integer shadow type { <2 x i32>, <2 x i32> }
+}
+
+define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int64x1x2_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X2_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X2_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_float64x1x2_t %val  ; expected return shadow is a zeroinitializer of the integer shadow type { <1 x i64>, <1 x i64> }
+}
+
+
+%struct.__neon_float32x4x2_t = type { <4 x float>,  <4 x float> }  ; result type of ld1_x2_v4f32 below
+%struct.__neon_float32x4x3_t = type { <4 x float>,  <4 x float>,  <4 x float> }
+%struct.__neon_float32x4x4_t = type { <4 x float>,  <4 x float>, <4 x float>,  <4 x float> }
+
+%struct.__neon_float64x2x2_t = type { <2 x double>,  <2 x double> }  ; result type of ld1_x2_v2f64 below
+%struct.__neon_float64x2x3_t = type { <2 x double>,  <2 x double>,  <2 x double> }
+%struct.__neon_float64x2x4_t = type { <2 x double>,  <2 x double>, <2 x double>,  <2 x double> }
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0(ptr) nounwind readonly  ; ld1x2 variants below each return a pair of 128-bit-wide vectors
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int8x16x2_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int16x8x2_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_int32x4x2_t @ld1_x2_v4i32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld1_x2_v4i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int32x4x2_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X2_T:%.*]] poison, <4 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X2_T]] [[TMP6]], <4 x float> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X4X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_float32x4x2_t %val  ; expected return shadow is a zeroinitializer of the integer shadow type { <4 x i32>, <4 x i32> }
+}
+
+define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int64x2x2_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X2_T:%.*]] poison, <2 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X2_T]] [[TMP6]], <2 x double> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X2X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr %addr)  ; ld1x2 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_float64x2x2_t %val  ; expected return shadow is a zeroinitializer of the integer shadow type { <2 x i64>, <2 x i64> }
+}
+
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0(ptr) nounwind readonly  ; ld1x3 variants below each return three 64-bit-wide vectors
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0(ptr %addr)  ; ld1x3 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int8x8x3_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0(ptr %addr)  ; ld1x3 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int16x4x3_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0(ptr %addr)  ; ld1x3 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int32x2x3_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X3_T:%.*]] poison, <2 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X3_T]] [[TMP6]], <2 x float> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X3_T]] [[TMP8]], <2 x float> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X2X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0(ptr %addr)  ; ld1x3 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_float32x2x3_t %val  ; expected return shadow is a zeroinitializer of the integer shadow type { <2 x i32>, <2 x i32>, <2 x i32> }
+}
+
+define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr %addr)  ; ld1x3 goes through MSan's unknown-instruction handling: only the shadow of %addr is checked
+  ret %struct.__neon_int64x1x3_t %val  ; expected return shadow is zeroinitializer; no shadow is loaded for the memory that was read
+}
+
+define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP8]], <1 x double> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP10]]
+; Input under test: returns the ld1x3 result for <1 x double> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls. The stored struct has <1 x i64> members even though the loaded value has <1 x double> members.
+  %val = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr %addr)
+  ret %struct.__neon_float64x1x3_t %val
+}
+
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0(ptr) nounwind readonly ; ld1x3 declarations for the 128-bit vector types (v16i8 ... v2f64) called by the ld1_x3_* functions that follow
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP10]]
+; Input under test: returns the ld1x3 result for <16 x i8> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0(ptr %addr)
+  ret %struct.__neon_int8x16x3_t %val
+}
+
+define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP10]]
+; Input under test: returns the ld1x3 result for <8 x i16> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0(ptr %addr)
+  ret %struct.__neon_int16x8x3_t %val
+}
+
+define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP10]]
+; Input under test: returns the ld1x3 result for <4 x i32> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0(ptr %addr)
+  ret %struct.__neon_int32x4x3_t %val
+}
+
+define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X3_T:%.*]] poison, <4 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X3_T]] [[TMP6]], <4 x float> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X3_T]] [[TMP8]], <4 x float> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X4X3_T]] [[TMP10]]
+; Input under test: returns the ld1x3 result for <4 x float> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls. The stored struct has <4 x i32> members even though the loaded value has <4 x float> members.
+  %val = call %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0(ptr %addr)
+  ret %struct.__neon_float32x4x3_t %val
+}
+
+define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP10]]
+; Input under test: returns the ld1x3 result for <2 x i64> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr %addr)
+  ret %struct.__neon_int64x2x3_t %val
+}
+
+define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X3_T:%.*]] poison, <2 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X3_T]] [[TMP6]], <2 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X3_T]] [[TMP8]], <2 x double> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X2X3_T]] [[TMP10]]
+; Input under test: returns the ld1x3 result for <2 x double> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls. The stored struct has <2 x i64> members even though the loaded value has <2 x double> members.
+  %val = call %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr %addr)
+  ret %struct.__neon_float64x2x3_t %val
+}
+
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0(ptr) nounwind readonly ; ld1x4 declarations for the 64-bit vector types (v8i8 ... v1f64) called by the ld1_x4_* functions that follow
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP10]], <8 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <8 x i8> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0(ptr %addr)
+  ret %struct.__neon_int8x8x4_t %val
+}
+
+define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP10]], <4 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <4 x i16> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0(ptr %addr)
+  ret %struct.__neon_int16x4x4_t %val
+}
+
+define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP10]], <2 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <2 x i32> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0(ptr %addr)
+  ret %struct.__neon_int32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X4_T:%.*]] poison, <2 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X4_T]] [[TMP6]], <2 x float> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X4_T]] [[TMP8]], <2 x float> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X4_T]] [[TMP10]], <2 x float> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X2X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <2 x float> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls. The stored struct has <2 x i32> members even though the loaded value has <2 x float> members.
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0(ptr %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP10]], <1 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <1 x i64> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr %addr)
+  ret %struct.__neon_int64x1x4_t %val
+}
+
+define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP8]], <1 x double> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP10]], <1 x double> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <1 x double> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls. The stored struct has <1 x i64> members even though the loaded value has <1 x double> members.
+  %val = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr %addr)
+  ret %struct.__neon_float64x1x4_t %val
+}
+
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0(ptr) nounwind readonly ; ld1x4 declarations for the 128-bit vector types (v16i8 ... v2f64) called by the ld1_x4_* functions that follow
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP10]], <16 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <16 x i8> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0(ptr %addr)
+  ret %struct.__neon_int8x16x4_t %val
+}
+
+define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP10]], <8 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <8 x i16> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0(ptr %addr)
+  ret %struct.__neon_int16x8x4_t %val
+}
+
+define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP10]], <4 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <4 x i32> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls.
+  %val = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0(ptr %addr)
+  ret %struct.__neon_int32x4x4_t %val
+}
+
+define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X4_T:%.*]] poison, <4 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X4_T]] [[TMP6]], <4 x float> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X4_T]] [[TMP8]], <4 x float> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X4_T]] [[TMP10]], <4 x float> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X4X4_T]] [[TMP12]]
+; Input under test: returns the ld1x4 result for <4 x float> unchanged. The expectations above pin the current instrumentation: a branch to __msan_warning_noreturn when the i64 loaded from @__msan_param_tls is nonzero, and a zeroinitializer struct stored to @__msan_retval_tls. The stored struct has <4 x i32> members even though the loaded value has <4 x float> members.
+  %val = call %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0(ptr %addr)
+  ret %struct.__neon_float32x4x4_t %val
+}
+
+; ld1x4 returning four <2 x i64> vectors.
+; The assertions record the current, known-imprecise instrumentation (see the
+; "Incorrectly handled" list at the top of this file): the shadow of %addr is
+; compared against zero and a warning is raised if it is non-zero, the
+; intrinsic call is emitted unchanged, and the return-value shadow is stored
+; as zeroinitializer without reading any shadow for the loaded memory.
+define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP10]], <2 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP12]]
+;
+  %val = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr %addr)
+  ret %struct.__neon_int64x2x4_t %val
+}
+
+; ld1x4 returning four <2 x double> vectors.
+; The assertions record the current, known-imprecise instrumentation (see the
+; "Incorrectly handled" list at the top of this file): the shadow of %addr is
+; compared against zero and a warning is raised if it is non-zero, the
+; intrinsic call is emitted unchanged, and the return-value shadow is stored
+; as zeroinitializer without reading any shadow for the loaded memory.
+; The return shadow struct uses <2 x i64> members for the <2 x double> values.
+define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X4_T:%.*]] poison, <2 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X4_T]] [[TMP6]], <2 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X4_T]] [[TMP8]], <2 x double> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X4_T]] [[TMP10]], <2 x double> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X2X4_T]] [[TMP12]]
+;
+  %val = call %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr %addr)
+  ret %struct.__neon_float64x2x4_t %val
+}
+
+; Splat of a byte loaded from an uninitialized stack slot; %__ret is unused.
+; Unlike the ld1xN intrinsic tests, this function is plain IR (alloca, load,
+; insertelement, shufflevector), so the expected output propagates shadow:
+; the shadow byte of the alloca is poisoned with a memset of -1, the load
+; reads that shadow back, and the splatted shadow vector is stored to
+; @__msan_retval_tls.
+define <8 x i8> @dup_ld1_from_stack(ptr %__ret) #0 {
+; CHECK-LABEL: define <8 x i8> @dup_ld1_from_stack(
+; CHECK-SAME: ptr [[__RET:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[ITEM:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[ITEM]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 1 [[TMP2]], i8 -1, i64 1, i1 false)
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ITEM]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[ITEM]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i8> poison, i8 [[TMP3]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i8> [[_MSPROP]], <8 x i8> [[_MSPROP]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    store <8 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[LANE]]
+;
+entry:
+  %item = alloca i8, align 1
+  %0 = load i8, ptr %item, align 1
+  %1 = insertelement <8 x i8> poison, i8 %0, i32 0
+  %lane = shufflevector <8 x i8> %1, <8 x i8> %1, <8 x i32> zeroinitializer
+  ret <8 x i8> %lane
+}
+
+; Attribute group #0 is attached to the test functions above and carries
+; sanitize_memory. The lines after it are the autogenerated assertion for
+; the branch-weight metadata referenced as PROF1 in the function bodies.
+attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.



More information about the llvm-commits mailing list