[llvm] [msan][NFCI] Add tests for Arm NEON vector load (PR #125267)

Thurston Dang via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 5 18:26:28 PST 2025


https://github.com/thurstond updated https://github.com/llvm/llvm-project/pull/125267

>From 9d384154d9367ec1eec7829aaa99b8572ac3fe6b Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Fri, 31 Jan 2025 18:51:29 +0000
Subject: [PATCH 1/3] [msan][NFCI] Add tests for Arm NEON vector load

Forked from llvm/test/CodeGen/AArch64/arm64-ld1.ll

Incorrectly handled by handleUnknownInstruction:
- llvm.aarch64.neon.ld1x2, llvm.aarch64.neon.ld1x3, llvm.aarch64.neon.ld1x4
- llvm.aarch64.neon.ld2, llvm.aarch64.neon.ld3, llvm.aarch64.neon.ld4
- llvm.aarch64.neon.ld2lane, llvm.aarch64.neon.ld3lane, llvm.aarch64.neon.ld4lane
- llvm.aarch64.neon.ld2r, llvm.aarch64.neon.ld3r, llvm.aarch64.neon.ld4r
---
 .../MemorySanitizer/AArch64/arm64-ld1.ll      | 4100 +++++++++++++++++
 1 file changed, 4100 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
new file mode 100644
index 0000000000000..673a14a9371b4
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
@@ -0,0 +1,4100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-ld1.ll
+;
+; Incorrectly handled (handleUnknownInstruction):
+; - llvm.aarch64.neon.ld1x2, llvm.aarch64.neon.ld1x3, llvm.aarch64.neon.ld1x4
+; - llvm.aarch64.neon.ld2, llvm.aarch64.neon.ld3, llvm.aarch64.neon.ld4
+; - llvm.aarch64.neon.ld2lane, llvm.aarch64.neon.ld3lane, llvm.aarch64.neon.ld4lane
+; - llvm.aarch64.neon.ld2r, llvm.aarch64.neon.ld3r, llvm.aarch64.neon.ld4r
+
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
+%struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
+%struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>, <8 x i8>,  <8 x i8> }
+
+define %struct.__neon_int8x8x2_t @ld2_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x2_t @ld2_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X2_T]] [[TMP8]]
+;
+; Inherited from the CodeGen test: make sure we are loading into the results defined by the ABI (i.e., v0, v1)
+; and from the argument of the function also defined by ABI (i.e., x0). With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld2 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x8x3_t @ld3_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x3_t @ld3_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X3_T]] [[TMP10]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld3 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x8x4_t @ld4_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x4_t @ld4_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP10]], <8 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X4_T]] [[TMP12]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld4 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0(ptr) nounwind readonly
+
+%struct.__neon_int8x16x2_t = type { <16 x i8>,  <16 x i8> }
+%struct.__neon_int8x16x3_t = type { <16 x i8>,  <16 x i8>,  <16 x i8> }
+%struct.__neon_int8x16x4_t = type { <16 x i8>,  <16 x i8>, <16 x i8>,  <16 x i8> }
+
+define %struct.__neon_int8x16x2_t @ld2_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP8]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld2 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x3_t @ld3_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP10]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld3 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x4_t @ld4_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP10]], <16 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP12]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld4 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0(ptr) nounwind readonly
+
+%struct.__neon_int16x4x2_t = type { <4 x i16>,  <4 x i16> }
+%struct.__neon_int16x4x3_t = type { <4 x i16>,  <4 x i16>,  <4 x i16> }
+%struct.__neon_int16x4x4_t = type { <4 x i16>,  <4 x i16>, <4 x i16>,  <4 x i16> }
+
+define %struct.__neon_int16x4x2_t @ld2_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x2_t @ld2_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X2_T]] [[TMP8]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld2 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x4x3_t @ld3_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x3_t @ld3_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X3_T]] [[TMP10]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld3 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int16x4x4_t @ld4_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x4_t @ld4_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP10]], <4 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X4_T]] [[TMP12]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld4 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0(ptr) nounwind readonly
+
+%struct.__neon_int16x8x2_t = type { <8 x i16>,  <8 x i16> }
+%struct.__neon_int16x8x3_t = type { <8 x i16>,  <8 x i16>,  <8 x i16> }
+%struct.__neon_int16x8x4_t = type { <8 x i16>,  <8 x i16>, <8 x i16>,  <8 x i16> }
+
+define %struct.__neon_int16x8x2_t @ld2_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP8]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld2 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x3_t @ld3_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP10]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld3 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x3_t %tmpvar2
+}
+
+define %struct.__neon_int16x8x4_t @ld4_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP10]], <8 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP12]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld4 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0(ptr) nounwind readonly
+
+%struct.__neon_int32x2x2_t = type { <2 x i32>,  <2 x i32> }
+%struct.__neon_int32x2x3_t = type { <2 x i32>,  <2 x i32>,  <2 x i32> }
+%struct.__neon_int32x2x4_t = type { <2 x i32>,  <2 x i32>, <2 x i32>,  <2 x i32> }
+
+define %struct.__neon_int32x2x2_t @ld2_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x2_t @ld2_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X2_T]] [[TMP8]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld2 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x2x3_t @ld3_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x3_t @ld3_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X3_T]] [[TMP10]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld3 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x2x4_t @ld4_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x4_t @ld4_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP10]], <2 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X4_T]] [[TMP12]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld4 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0(ptr) nounwind readonly
+
+%struct.__neon_int32x4x2_t = type { <4 x i32>,  <4 x i32> }
+%struct.__neon_int32x4x3_t = type { <4 x i32>,  <4 x i32>,  <4 x i32> }
+%struct.__neon_int32x4x4_t = type { <4 x i32>,  <4 x i32>, <4 x i32>,  <4 x i32> }
+
+define %struct.__neon_int32x4x2_t @ld2_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP8]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld2 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x3_t @ld3_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP10]]
+;
+; Inherited from the CodeGen test: make sure we are using the operands defined by the ABI. With -passes=msan, the assertions above pin a @__msan_param_tls != 0 check before the ld3 call and a zeroinitializer store to @__msan_retval_tls.
+  %tmpvar2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x4_t @ld4_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP10]], <4 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP12]]
+;
+; The checks above show the current MSan handling of ld4: the shadow of %A is checked before the call and a zeroinitializer return shadow is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0(ptr) nounwind readonly
+
+%struct.__neon_int64x2x2_t = type { <2 x i64>,  <2 x i64> }
+%struct.__neon_int64x2x3_t = type { <2 x i64>,  <2 x i64>,  <2 x i64> }
+%struct.__neon_int64x2x4_t = type { <2 x i64>,  <2 x i64>, <2 x i64>,  <2 x i64> }
+
+define %struct.__neon_int64x2x2_t @ld2_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP8]]
+;
+; The checks above show the current MSan handling of ld2: the shadow of %A is checked before the call and a zeroinitializer return shadow is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x3_t @ld3_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld3_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP10]]
+;
+; The checks above show the current MSan handling of ld3: the shadow of %A is checked before the call and a zeroinitializer return shadow is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x4_t @ld4_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP10]], <2 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP12]]
+;
+; The checks above show the current MSan handling of ld4: the shadow of %A is checked before the call and a zeroinitializer return shadow is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0(ptr) nounwind readonly
+
+%struct.__neon_int64x1x2_t = type { <1 x i64>,  <1 x i64> }
+%struct.__neon_int64x1x3_t = type { <1 x i64>,  <1 x i64>, <1 x i64> }
+%struct.__neon_int64x1x4_t = type { <1 x i64>,  <1 x i64>, <1 x i64>, <1 x i64> }
+
+
+define %struct.__neon_int64x1x2_t @ld2_1di64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x2_t @ld2_1di64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X2_T]] [[TMP8]]
+;
+; The checks above show the current MSan handling of ld2: the shadow of %A is checked before the call and a zeroinitializer return shadow is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x1x3_t @ld3_1di64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x3_t @ld3_1di64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X3_T]] [[TMP10]]
+;
+; The checks above show the current MSan handling of ld3: the shadow of %A is checked before the call and a zeroinitializer return shadow is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x1x4_t @ld4_1di64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x4_t @ld4_1di64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP10]], <1 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X4_T]] [[TMP12]]
+;
+; The checks above show the current MSan handling of ld4: the shadow of %A is checked before the call and a zeroinitializer return shadow is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x4_t  %tmpvar2
+}
+
+
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0(ptr) nounwind readonly
+
+%struct.__neon_float64x1x2_t = type { <1 x double>,  <1 x double> }
+%struct.__neon_float64x1x3_t = type { <1 x double>,  <1 x double>, <1 x double> }
+%struct.__neon_float64x1x4_t = type { <1 x double>,  <1 x double>, <1 x double>, <1 x double> }
+
+
+define %struct.__neon_float64x1x2_t @ld2_1df64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x2_t @ld2_1df64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X2_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X2_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X2_T]] [[TMP8]]
+;
+; The checks above show the current MSan handling of ld2: the shadow of %A is checked before the call and a zeroinitializer return shadow (integer-typed, <1 x i64> per <1 x double> element) is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0(ptr %A)
+  ret %struct.__neon_float64x1x2_t  %tmpvar2
+}
+
+define %struct.__neon_float64x1x3_t @ld3_1df64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x3_t @ld3_1df64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP8]], <1 x double> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP10]]
+;
+; The checks above show the current MSan handling of ld3: the shadow of %A is checked before the call and a zeroinitializer return shadow (integer-typed, <1 x i64> per <1 x double> element) is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0(ptr %A)
+  ret %struct.__neon_float64x1x3_t  %tmpvar2
+}
+
+define %struct.__neon_float64x1x4_t @ld4_1df64(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x4_t @ld4_1df64(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP8]], <1 x double> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP10]], <1 x double> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP12]]
+;
+; The checks above show the current MSan handling of ld4: the shadow of %A is checked before the call and a zeroinitializer return shadow (integer-typed, <1 x i64> per <1 x double> element) is stored. The "operands defined by the ABI" remark from the CodeGen original is not exercised by this opt-only test.
+  %tmpvar2 = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0(ptr %A)
+  ret %struct.__neon_float64x1x4_t  %tmpvar2
+}
+
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0(ptr) nounwind readonly
+
+
+define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, ptr %A) nounwind #0 {
+; MSan checks below: the shadows of %L1, %L2 and %A are all checked before the ld2lane call and a zeroinitializer return shadow is stored. The CHECK-SD/CHECK-GI lines (and the ABI-operand remark they belonged to) are leftovers from the CodeGen original; the RUN line uses only the default CHECK prefix, so they are not matched.
+; CHECK-SD-LABEL: ld2lane_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ld2.b { v0, v1 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld2lane_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ld2.b { v0, v1 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2lane_16b(
+; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[L1]], <16 x i8> [[L2]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T:%.*]] poison, <16 x i8> [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP8]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T]] [[TMP10]], <16 x i8> [[TMP11]], 1
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP12]]
+;
+  %tmpvar2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, i64 1, ptr %A)
+  ret %struct.__neon_int8x16x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, ptr %A) nounwind #0 {
+; MSan checks below: the shadows of %L1, %L2, %L3 and %A are all checked before the ld3lane call and a zeroinitializer return shadow is stored. The CHECK-SD/CHECK-GI lines (and the ABI-operand remark they belonged to) are leftovers from the CodeGen original; the RUN line uses only the default CHECK prefix, so they are not matched.
+; CHECK-SD-LABEL: ld3lane_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    ld3.b { v0, v1, v2 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld3lane_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    ld3.b { v0, v1, v2 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3lane_16b(
+; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], <16 x i8> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[L1]], <16 x i8> [[L2]], <16 x i8> [[L3]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T:%.*]] poison, <16 x i8> [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP12]], <16 x i8> [[TMP13]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP10]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP14]], <16 x i8> [[TMP15]], 2
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP16]]
+;
+  %tmpvar2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, ptr %A)
+  ret %struct.__neon_int8x16x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, ptr %A) nounwind #0 {
+; MSan checks below: the shadows of %L1, %L2, %L3, %L4 and %A are all checked before the ld4lane call and a zeroinitializer return shadow is stored. The CHECK-SD/CHECK-GI lines (and the ABI-operand remark they belonged to) are leftovers from the CodeGen original; the RUN line uses only the default CHECK prefix, so they are not matched.
+; CHECK-SD-LABEL: ld4lane_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ld4.b { v0, v1, v2, v3 }[1], [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld4lane_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    ld4.b { v0, v1, v2, v3 }[1], [x0]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4lane_16b(
+; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], <16 x i8> [[L3:%.*]], <16 x i8> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[L1]], <16 x i8> [[L2]], <16 x i8> [[L3]], <16 x i8> [[L4]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T:%.*]] poison, <16 x i8> [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP12]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP14]], <16 x i8> [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP12]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP16]], <16 x i8> [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP12]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP18]], <16 x i8> [[TMP19]], 3
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP20]]
+;
+  %tmpvar2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, ptr %A)
+  ret %struct.__neon_int8x16x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8>, <16 x i8>, i64, ptr) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, i64, ptr) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, ptr) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, ptr %A) nounwind #0 {
+; MemorySanitizer test for llvm.aarch64.neon.ld2lane on <8 x i16> with lane operand 1.
+; Asserted instrumentation - the parameter shadows of %L1, %L2 and %A are read from
+; @__msan_param_tls and @__msan_warning_noreturn is called if any of them is non-zero.
+; The intrinsic call itself is unchanged and an all-zero shadow is stored to
+; @__msan_retval_tls for the returned struct. The file header lists this intrinsic
+; under handleUnknownInstruction, i.e. as not yet handled correctly.
+; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2lane_8h(
+; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[L1]], <8 x i16> [[L2]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T:%.*]] poison, <8 x i16> [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP8]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T]] [[TMP10]], <8 x i16> [[TMP11]], 1
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP12]]
+;
+  %tmpvar2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, i64 1, ptr %A)
+  ret %struct.__neon_int16x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, ptr %A) nounwind #0 {
+; MemorySanitizer test for llvm.aarch64.neon.ld3lane on <8 x i16> with lane operand 1.
+; Asserted instrumentation - the parameter shadows of %L1, %L2, %L3 and %A are read
+; from @__msan_param_tls and @__msan_warning_noreturn is called if any of them is
+; non-zero. The intrinsic call itself is unchanged and an all-zero shadow is stored
+; to @__msan_retval_tls for the returned struct. The file header lists this
+; intrinsic under handleUnknownInstruction, i.e. as not yet handled correctly.
+; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3lane_8h(
+; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], <8 x i16> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[L1]], <8 x i16> [[L2]], <8 x i16> [[L3]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T:%.*]] poison, <8 x i16> [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP12]], <8 x i16> [[TMP13]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP10]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP14]], <8 x i16> [[TMP15]], 2
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP16]]
+;
+  %tmpvar2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, ptr %A)
+  ret %struct.__neon_int16x8x3_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, ptr %A) nounwind #0 {
+; MemorySanitizer test for llvm.aarch64.neon.ld4lane on <8 x i16> with lane operand 1.
+; Asserted instrumentation - the parameter shadows of %L1, %L2, %L3, %L4 and %A are
+; read from @__msan_param_tls and @__msan_warning_noreturn is called if any of them
+; is non-zero. The intrinsic call itself is unchanged and an all-zero shadow is
+; stored to @__msan_retval_tls for the returned struct. The file header lists this
+; intrinsic under handleUnknownInstruction, i.e. as not yet handled correctly.
+; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4lane_8h(
+; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], <8 x i16> [[L3:%.*]], <8 x i16> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[L1]], <8 x i16> [[L2]], <8 x i16> [[L3]], <8 x i16> [[L4]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T:%.*]] poison, <8 x i16> [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP12]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP14]], <8 x i16> [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP12]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP16]], <8 x i16> [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP12]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP18]], <8 x i16> [[TMP19]], 3
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP20]]
+;
+  %tmpvar2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, ptr %A)
+  ret %struct.__neon_int16x8x4_t  %tmpvar2
+}
+
+; Declarations of the <8 x i16> ld2lane/ld3lane/ld4lane intrinsics. Each one takes
+; the N source vectors, an i64 lane index and the pointer to load from.
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16>, <8 x i16>, i64, ptr) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, i64, ptr) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, ptr) nounwind readonly
+
+define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, ptr %A) nounwind #0 {
+; MemorySanitizer test for llvm.aarch64.neon.ld2lane on <4 x i32> with lane operand 1.
+; Asserted instrumentation - the parameter shadows of %L1, %L2 and %A are read from
+; @__msan_param_tls and @__msan_warning_noreturn is called if any of them is non-zero.
+; The intrinsic call itself is unchanged and an all-zero shadow is stored to
+; @__msan_retval_tls for the returned struct. The file header lists this intrinsic
+; under handleUnknownInstruction, i.e. as not yet handled correctly.
+; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2lane_4s(
+; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[L1]], <4 x i32> [[L2]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T:%.*]] poison, <4 x i32> [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP8]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T]] [[TMP10]], <4 x i32> [[TMP11]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP12]]
+;
+  %tmpvar2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, i64 1, ptr %A)
+  ret %struct.__neon_int32x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, ptr %A) nounwind #0 {
+; MemorySanitizer test for llvm.aarch64.neon.ld3lane on <4 x i32> with lane operand 1.
+; Asserted instrumentation - the parameter shadows of %L1, %L2, %L3 and %A are read
+; from @__msan_param_tls and @__msan_warning_noreturn is called if any of them is
+; non-zero. The intrinsic call itself is unchanged and an all-zero shadow is stored
+; to @__msan_retval_tls for the returned struct. The file header lists this
+; intrinsic under handleUnknownInstruction, i.e. as not yet handled correctly.
+; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3lane_4s(
+; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], <4 x i32> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> [[L3]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T:%.*]] poison, <4 x i32> [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP12]], <4 x i32> [[TMP13]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP10]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP14]], <4 x i32> [[TMP15]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP16]]
+;
+  %tmpvar2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, ptr %A)
+  ret %struct.__neon_int32x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, ptr %A) nounwind #0 {
+; MemorySanitizer test for llvm.aarch64.neon.ld4lane on <4 x i32> with lane operand 1.
+; Asserted instrumentation - the parameter shadows of %L1, %L2, %L3, %L4 and %A are
+; read from @__msan_param_tls and @__msan_warning_noreturn is called if any of them
+; is non-zero. The intrinsic call itself is unchanged and an all-zero shadow is
+; stored to @__msan_retval_tls for the returned struct. The file header lists this
+; intrinsic under handleUnknownInstruction, i.e. as not yet handled correctly.
+; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4lane_4s(
+; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], <4 x i32> [[L3:%.*]], <4 x i32> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> [[L3]], <4 x i32> [[L4]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T:%.*]] poison, <4 x i32> [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP14]], <4 x i32> [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP16]], <4 x i32> [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP18]], <4 x i32> [[TMP19]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP20]]
+;
+  %tmpvar2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, ptr %A)
+  ret %struct.__neon_int32x4x4_t  %tmpvar2
+}
+
+; Declarations of the <4 x i32> ld2lane/ld3lane/ld4lane intrinsics. Each one takes
+; the N source vectors, an i64 lane index and the pointer to load from.
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32>, <4 x i32>, i64, ptr) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, i64, ptr) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, ptr) nounwind readonly
+
+define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, ptr %A) nounwind #0 {
+; MemorySanitizer test for llvm.aarch64.neon.ld2lane on <2 x i64> with lane operand 1.
+; Asserted instrumentation - the parameter shadows of %L1, %L2 and %A are read from
+; @__msan_param_tls and @__msan_warning_noreturn is called if any of them is non-zero.
+; The intrinsic call itself is unchanged and an all-zero shadow is stored to
+; @__msan_retval_tls for the returned struct. The file header lists this intrinsic
+; under handleUnknownInstruction, i.e. as not yet handled correctly.
+; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2lane_2d(
+; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[L1]], <2 x i64> [[L2]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T:%.*]] poison, <2 x i64> [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP8]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T]] [[TMP10]], <2 x i64> [[TMP11]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP12]]
+;
+  %tmpvar2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, i64 1, ptr %A)
+  ret %struct.__neon_int64x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, ptr %A) nounwind #0 {
+; MemorySanitizer test for llvm.aarch64.neon.ld3lane on <2 x i64> with lane operand 1.
+; Asserted instrumentation - the parameter shadows of %L1, %L2, %L3 and %A are read
+; from @__msan_param_tls and @__msan_warning_noreturn is called if any of them is
+; non-zero. The intrinsic call itself is unchanged and an all-zero shadow is stored
+; to @__msan_retval_tls for the returned struct. The file header lists this
+; intrinsic under handleUnknownInstruction, i.e. as not yet handled correctly.
+; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld3lane_2d(
+; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], <2 x i64> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[L1]], <2 x i64> [[L2]], <2 x i64> [[L3]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T:%.*]] poison, <2 x i64> [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP12]], <2 x i64> [[TMP13]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP10]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP14]], <2 x i64> [[TMP15]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP16]]
+;
+  %tmpvar2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, ptr %A)
+  ret %struct.__neon_int64x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, ptr %A) nounwind #0 {
+; MemorySanitizer test for llvm.aarch64.neon.ld4lane on <2 x i64> with lane operand 1.
+; Asserted instrumentation - the parameter shadows of %L1, %L2, %L3, %L4 and %A are
+; read from @__msan_param_tls and @__msan_warning_noreturn is called if any of them
+; is non-zero. The intrinsic call itself is unchanged and an all-zero shadow is
+; stored to @__msan_retval_tls for the returned struct. The file header lists this
+; intrinsic under handleUnknownInstruction, i.e. as not yet handled correctly.
+; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4lane_2d(
+; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], <2 x i64> [[L3:%.*]], <2 x i64> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[L1]], <2 x i64> [[L2]], <2 x i64> [[L3]], <2 x i64> [[L4]], i64 1, ptr [[A]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T:%.*]] poison, <2 x i64> [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP12]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP14]], <2 x i64> [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP12]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP16]], <2 x i64> [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP12]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP18]], <2 x i64> [[TMP19]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP20]]
+;
+  %tmpvar2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, ptr %A)
+  ret %struct.__neon_int64x2x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64>, <2 x i64>, i64, ptr) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readonly
+
+define <8 x i8> @ld1r_8b(ptr %bar) #0 {
+; CHECK-LABEL: define <8 x i8> @ld1r_8b(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i8, ptr [[BAR]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i8> undef, i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <8 x i8> [[TMPVAR2]], i8 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <8 x i8> [[TMPVAR3]], i8 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <8 x i8> [[TMPVAR4]], i8 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[_MSLD]], i32 4
+; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <8 x i8> [[TMPVAR5]], i8 [[TMPVAR1]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP4]], i8 [[_MSLD]], i32 5
+; CHECK-NEXT:    [[TMPVAR7:%.*]] = insertelement <8 x i8> [[TMPVAR6]], i8 [[TMPVAR1]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[_MSLD]], i32 6
+; CHECK-NEXT:    [[TMPVAR8:%.*]] = insertelement <8 x i8> [[TMPVAR7]], i8 [[TMPVAR1]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[_MSLD]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> [[TMPVAR8]], i8 [[TMPVAR1]], i32 7
+; CHECK-NEXT:    store <8 x i8> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[TMP9]]
+;
+; Broadcasts one loaded i8 into every lane of an <8 x i8>; the assertions above expect the loaded shadow byte to be inserted lane by lane and stored to @__msan_retval_tls.
+  %tmpvar1 = load i8, ptr %bar
+  %tmpvar2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmpvar1, i32 0 ; all-undef base vector; the expected shadow operand for it is splat (i8 -1)
+  %tmpvar3 = insertelement <8 x i8> %tmpvar2, i8 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <8 x i8> %tmpvar3, i8 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <8 x i8> %tmpvar4, i8 %tmpvar1, i32 3
+  %tmpvar6 = insertelement <8 x i8> %tmpvar5, i8 %tmpvar1, i32 4
+  %tmpvar7 = insertelement <8 x i8> %tmpvar6, i8 %tmpvar1, i32 5
+  %tmpvar8 = insertelement <8 x i8> %tmpvar7, i8 %tmpvar1, i32 6
+  %tmpvar9 = insertelement <8 x i8> %tmpvar8, i8 %tmpvar1, i32 7
+  ret <8 x i8> %tmpvar9
+}
+
+define <16 x i8> @ld1r_16b(ptr %bar) #0 {
+; CHECK-LABEL: define <16 x i8> @ld1r_16b(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i8, ptr [[BAR]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> splat (i8 -1), i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <16 x i8> undef, i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <16 x i8> [[TMPVAR2]], i8 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <16 x i8> [[TMPVAR3]], i8 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <16 x i8> [[TMPVAR4]], i8 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[_MSLD]], i32 4
+; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <16 x i8> [[TMPVAR5]], i8 [[TMPVAR1]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[_MSLD]], i32 5
+; CHECK-NEXT:    [[TMPVAR7:%.*]] = insertelement <16 x i8> [[TMPVAR6]], i8 [[TMPVAR1]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[_MSLD]], i32 6
+; CHECK-NEXT:    [[TMPVAR8:%.*]] = insertelement <16 x i8> [[TMPVAR7]], i8 [[TMPVAR1]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[_MSLD]], i32 7
+; CHECK-NEXT:    [[TMPVAR9:%.*]] = insertelement <16 x i8> [[TMPVAR8]], i8 [[TMPVAR1]], i32 7
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 [[_MSLD]], i32 8
+; CHECK-NEXT:    [[TMPVAR10:%.*]] = insertelement <16 x i8> [[TMPVAR9]], i8 [[TMPVAR1]], i32 8
+; CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 [[_MSLD]], i32 9
+; CHECK-NEXT:    [[TMPVAR11:%.*]] = insertelement <16 x i8> [[TMPVAR10]], i8 [[TMPVAR1]], i32 9
+; CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 [[_MSLD]], i32 10
+; CHECK-NEXT:    [[TMPVAR12:%.*]] = insertelement <16 x i8> [[TMPVAR11]], i8 [[TMPVAR1]], i32 10
+; CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 [[_MSLD]], i32 11
+; CHECK-NEXT:    [[TMPVAR13:%.*]] = insertelement <16 x i8> [[TMPVAR12]], i8 [[TMPVAR1]], i32 11
+; CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 [[_MSLD]], i32 12
+; CHECK-NEXT:    [[TMPVAR14:%.*]] = insertelement <16 x i8> [[TMPVAR13]], i8 [[TMPVAR1]], i32 12
+; CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 [[_MSLD]], i32 13
+; CHECK-NEXT:    [[TMPVAR15:%.*]] = insertelement <16 x i8> [[TMPVAR14]], i8 [[TMPVAR1]], i32 13
+; CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 [[_MSLD]], i32 14
+; CHECK-NEXT:    [[TMPVAR16:%.*]] = insertelement <16 x i8> [[TMPVAR15]], i8 [[TMPVAR1]], i32 14
+; CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 [[_MSLD]], i32 15
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i8> [[TMPVAR16]], i8 [[TMPVAR1]], i32 15
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP15]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[TMP17]]
+;
+; Broadcasts one loaded i8 into every lane of a <16 x i8>; the assertions above expect the loaded shadow byte to be inserted lane by lane and stored to @__msan_retval_tls.
+  %tmpvar1 = load i8, ptr %bar
+  %tmpvar2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmpvar1, i32 0 ; all-undef base vector; the expected shadow operand for it is splat (i8 -1)
+  %tmpvar3 = insertelement <16 x i8> %tmpvar2, i8 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <16 x i8> %tmpvar3, i8 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <16 x i8> %tmpvar4, i8 %tmpvar1, i32 3
+  %tmpvar6 = insertelement <16 x i8> %tmpvar5, i8 %tmpvar1, i32 4
+  %tmpvar7 = insertelement <16 x i8> %tmpvar6, i8 %tmpvar1, i32 5
+  %tmpvar8 = insertelement <16 x i8> %tmpvar7, i8 %tmpvar1, i32 6
+  %tmpvar9 = insertelement <16 x i8> %tmpvar8, i8 %tmpvar1, i32 7
+  %tmpvar10 = insertelement <16 x i8> %tmpvar9, i8 %tmpvar1, i32 8
+  %tmpvar11 = insertelement <16 x i8> %tmpvar10, i8 %tmpvar1, i32 9
+  %tmpvar12 = insertelement <16 x i8> %tmpvar11, i8 %tmpvar1, i32 10
+  %tmpvar13 = insertelement <16 x i8> %tmpvar12, i8 %tmpvar1, i32 11
+  %tmpvar14 = insertelement <16 x i8> %tmpvar13, i8 %tmpvar1, i32 12
+  %tmpvar15 = insertelement <16 x i8> %tmpvar14, i8 %tmpvar1, i32 13
+  %tmpvar16 = insertelement <16 x i8> %tmpvar15, i8 %tmpvar1, i32 14
+  %tmpvar17 = insertelement <16 x i8> %tmpvar16, i8 %tmpvar1, i32 15
+  ret <16 x i8> %tmpvar17
+}
+
+define <4 x i16> @ld1r_4h(ptr %bar) #0 {
+; CHECK-LABEL: define <4 x i16> @ld1r_4h(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i16, ptr [[BAR]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i16> undef, i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i16> [[_MSPROP]], i16 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x i16> [[TMPVAR2]], i16 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i16> [[_MSPROP1]], i16 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <4 x i16> [[TMPVAR3]], i16 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i16> [[_MSPROP2]], i16 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <4 x i16> [[TMPVAR4]], i16 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    store <4 x i16> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i16> [[TMPVAR5]]
+;
+; Broadcasts one loaded i16 into every lane of a <4 x i16>; the assertions above expect the loaded shadow value to be inserted lane by lane and stored to @__msan_retval_tls.
+  %tmpvar1 = load i16, ptr %bar
+  %tmpvar2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmpvar1, i32 0 ; all-undef base vector; the expected shadow operand for it is splat (i16 -1)
+  %tmpvar3 = insertelement <4 x i16> %tmpvar2, i16 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <4 x i16> %tmpvar3, i16 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <4 x i16> %tmpvar4, i16 %tmpvar1, i32 3
+  ret <4 x i16> %tmpvar5
+}
+
+define <8 x i16> @ld1r_8h(ptr %bar) #0 {
+; CHECK-LABEL: define <8 x i16> @ld1r_8h(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i16, ptr [[BAR]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i16> undef, i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <8 x i16> [[TMPVAR2]], i16 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <8 x i16> [[TMPVAR3]], i16 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <8 x i16> [[TMPVAR4]], i16 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[_MSLD]], i32 4
+; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <8 x i16> [[TMPVAR5]], i16 [[TMPVAR1]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP4]], i16 [[_MSLD]], i32 5
+; CHECK-NEXT:    [[TMPVAR7:%.*]] = insertelement <8 x i16> [[TMPVAR6]], i16 [[TMPVAR1]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[_MSLD]], i32 6
+; CHECK-NEXT:    [[TMPVAR8:%.*]] = insertelement <8 x i16> [[TMPVAR7]], i16 [[TMPVAR1]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[_MSLD]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i16> [[TMPVAR8]], i16 [[TMPVAR1]], i32 7
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[TMP9]]
+;
+; Broadcasts one loaded i16 into every lane of an <8 x i16>; the assertions above expect the loaded shadow value to be inserted lane by lane and stored to @__msan_retval_tls.
+  %tmpvar1 = load i16, ptr %bar
+  %tmpvar2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmpvar1, i32 0 ; all-undef base vector; the expected shadow operand for it is splat (i16 -1)
+  %tmpvar3 = insertelement <8 x i16> %tmpvar2, i16 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <8 x i16> %tmpvar3, i16 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <8 x i16> %tmpvar4, i16 %tmpvar1, i32 3
+  %tmpvar6 = insertelement <8 x i16> %tmpvar5, i16 %tmpvar1, i32 4
+  %tmpvar7 = insertelement <8 x i16> %tmpvar6, i16 %tmpvar1, i32 5
+  %tmpvar8 = insertelement <8 x i16> %tmpvar7, i16 %tmpvar1, i32 6
+  %tmpvar9 = insertelement <8 x i16> %tmpvar8, i16 %tmpvar1, i32 7
+  ret <8 x i16> %tmpvar9
+}
+
+define <2 x i32> @ld1r_2s(ptr %bar) #0 {
+; CHECK-LABEL: define <2 x i32> @ld1r_2s(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> undef, i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMPVAR2]], i32 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
+;
+; Broadcasts one loaded i32 into both lanes of a <2 x i32>; the assertions above expect the loaded shadow value to be inserted into each lane and stored to @__msan_retval_tls.
+  %tmpvar1 = load i32, ptr %bar
+  %tmpvar2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmpvar1, i32 0 ; all-undef base vector; the expected shadow operand for it is splat (i32 -1)
+  %tmpvar3 = insertelement <2 x i32> %tmpvar2, i32 %tmpvar1, i32 1
+  ret <2 x i32> %tmpvar3
+}
+
+define <4 x i32> @ld1r_4s(ptr %bar) #0 {
+; CHECK-LABEL: define <4 x i32> @ld1r_4s(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i32> undef, i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x i32> [[TMPVAR2]], i32 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <4 x i32> [[TMPVAR3]], i32 [[TMPVAR1]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = insertelement <4 x i32> [[TMPVAR4]], i32 [[TMPVAR1]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMPVAR5]]
+;
+; Broadcasts one loaded i32 into every lane of a <4 x i32>; the assertions above expect the loaded shadow value to be inserted lane by lane and stored to @__msan_retval_tls.
+  %tmpvar1 = load i32, ptr %bar
+  %tmpvar2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmpvar1, i32 0 ; all-undef base vector; the expected shadow operand for it is splat (i32 -1)
+  %tmpvar3 = insertelement <4 x i32> %tmpvar2, i32 %tmpvar1, i32 1
+  %tmpvar4 = insertelement <4 x i32> %tmpvar3, i32 %tmpvar1, i32 2
+  %tmpvar5 = insertelement <4 x i32> %tmpvar4, i32 %tmpvar1, i32 3
+  ret <4 x i32> %tmpvar5
+}
+
+define <2 x i64> @ld1r_2d(ptr %bar) #0 {
+; CHECK-LABEL: define <2 x i64> @ld1r_2d(
+; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i64, ptr [[BAR]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i64> undef, i64 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMPVAR2]], i64 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+; Broadcasts one loaded i64 into both lanes of a <2 x i64>; the assertions above expect the loaded shadow value to be inserted into each lane and stored to @__msan_retval_tls.
+  %tmpvar1 = load i64, ptr %bar
+  %tmpvar2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmpvar1, i32 0 ; all-undef base vector; the expected shadow operand for it is splat (i64 -1)
+  %tmpvar3 = insertelement <2 x i64> %tmpvar2, i64 %tmpvar1, i32 1
+  ret <2 x i64> %tmpvar3
+}
+
+define %struct.__neon_int8x8x2_t @ld2r_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x2_t @ld2r_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X2_T]] [[TMP8]]
+;
+; Calls llvm.aarch64.neon.ld2r (v8i8) on %A; the assertions above expect a warning branch on a non-zero pointer shadow and an all-zero shadow stored for the returned struct (listed as incorrectly handled in the file header).
+  %tmpvar2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x8x3_t @ld3r_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x3_t @ld3r_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X3_T]] [[TMP10]]
+;
+; Calls llvm.aarch64.neon.ld3r (v8i8) on %A; the assertions above expect a warning branch on a non-zero pointer shadow and an all-zero shadow stored for the returned struct (listed as incorrectly handled in the file header).
+  %tmpvar2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x8x4_t @ld4r_8b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x4_t @ld4r_8b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP10]], <8 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X4_T]] [[TMP12]]
+;
+; Calls llvm.aarch64.neon.ld4r (v8i8) on %A; the assertions above expect a warning branch on a non-zero pointer shadow and an all-zero shadow stored for the returned struct (listed as incorrectly handled in the file header).
+  %tmpvar2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0(ptr %A)
+  ret %struct.__neon_int8x8x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x16x2_t @ld2r_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2r_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP8]]
+;
+; Calls llvm.aarch64.neon.ld2r (v16i8) on %A; the assertions above expect a warning branch on a non-zero pointer shadow and an all-zero shadow stored for the returned struct (listed as incorrectly handled in the file header).
+  %tmpvar2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x2_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x3_t @ld3r_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3r_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP10]]
+;
+; Calls llvm.aarch64.neon.ld3r (v16i8) on %A; the assertions above expect a warning branch on a non-zero pointer shadow and an all-zero shadow stored for the returned struct (listed as incorrectly handled in the file header).
+  %tmpvar2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x3_t  %tmpvar2
+}
+
+define %struct.__neon_int8x16x4_t @ld4r_16b(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4r_16b(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP10]], <16 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP12]]
+;
+; Calls llvm.aarch64.neon.ld4r (v16i8) on %A; the assertions above expect a warning branch on a non-zero pointer shadow and an all-zero shadow stored for the returned struct (listed as incorrectly handled in the file header).
+  %tmpvar2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0(ptr %A)
+  ret %struct.__neon_int8x16x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0(ptr) nounwind readonly
+
+define %struct.__neon_int16x4x2_t @ld2r_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x2_t @ld2r_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X2_T]] [[TMP8]]
+;
+; Calls llvm.aarch64.neon.ld2r (v4i16) on %A; the assertions above expect a warning branch on a non-zero pointer shadow and an all-zero shadow stored for the returned struct (listed as incorrectly handled in the file header).
+  %tmpvar2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x4x3_t @ld3r_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x3_t @ld3r_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X3_T]] [[TMP10]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld3r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int16x4x4_t @ld4r_4h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x4_t @ld4r_4h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP10]], <4 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X4_T]] [[TMP12]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld4r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0(ptr %A)
+  ret %struct.__neon_int16x4x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0(ptr) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2r_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2r_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP8]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld2r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x2_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x3_t @ld3r_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3r_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP10]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld3r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x3_t  %tmpvar2
+}
+
+define %struct.__neon_int16x8x4_t @ld4r_8h(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4r_8h(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP10]], <8 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP12]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld4r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0(ptr %A)
+  ret %struct.__neon_int16x8x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0(ptr) nounwind readonly
+
+define %struct.__neon_int32x2x2_t @ld2r_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x2_t @ld2r_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X2_T]] [[TMP8]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld2r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x2x3_t @ld3r_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x3_t @ld3r_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X3_T]] [[TMP10]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld3r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x2x4_t @ld4r_2s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x4_t @ld4r_2s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP10]], <2 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X4_T]] [[TMP12]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld4r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0(ptr %A)
+  ret %struct.__neon_int32x2x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0(ptr) nounwind readonly
+
+define %struct.__neon_int32x4x2_t @ld2r_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2r_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP8]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld2r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x2_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x3_t @ld3r_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3r_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP10]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld3r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x3_t  %tmpvar2
+}
+
+define %struct.__neon_int32x4x4_t @ld4r_4s(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4r_4s(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP10]], <4 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP12]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld4r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0(ptr %A)
+  ret %struct.__neon_int32x4x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0(ptr) nounwind readonly
+
+define %struct.__neon_int64x1x2_t @ld2r_1d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x2_t @ld2r_1d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X2_T]] [[TMP8]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld2r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x1x3_t @ld3r_1d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x3_t @ld3r_1d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X3_T]] [[TMP10]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld3r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x1x4_t @ld4r_1d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x4_t @ld4r_1d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP10]], <1 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X4_T]] [[TMP12]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld4r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %A)
+  ret %struct.__neon_int64x1x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int64x2x2_t @ld2r_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2r_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP8]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld2r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x2_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x3_t @ld3r_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld3r_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP10]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld3r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x3_t  %tmpvar2
+}
+
+define %struct.__neon_int64x2x4_t @ld4r_2d(ptr %A) nounwind #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4r_2d(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP10]], <2 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP12]]
+;
+; Current MSan output: only the shadow of %A is checked; the ld4r result is returned with an all-zero shadow (the shadow of the loaded memory is not propagated).
+  %tmpvar2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %A)
+  ret %struct.__neon_int64x2x4_t  %tmpvar2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0(ptr) nounwind readonly
+
+define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <16 x i8> @ld1_16b(
+; CHECK-SAME: <16 x i8> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i8, ptr [[BAR]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <16 x i8> [[V]], i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[TMPVAR2]]
+;
+; Scalar load + insertelement: the shadow of %bar is checked, then the shadow byte loaded for *%bar is inserted into lane 0 of %V's shadow and returned.
+  %tmpvar1 = load i8, ptr %bar
+  %tmpvar2 = insertelement <16 x i8> %V, i8 %tmpvar1, i32 0
+  ret <16 x i8> %tmpvar2
+}
+
+define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <8 x i16> @ld1_8h(
+; CHECK-SAME: <8 x i16> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i16, ptr [[BAR]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i16> [[V]], i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[TMPVAR2]]
+;
+; Scalar i16 load inserted into lane 0 of %V: the loaded shadow (_MSLD) is inserted into lane 0 of %V's shadow.
+  %tmpvar1 = load i16, ptr %bar
+  %tmpvar2 = insertelement <8 x i16> %V, i16 %tmpvar1, i32 0
+  ret <8 x i16> %tmpvar2
+}
+
+define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <4 x i32> @ld1_4s(
+; CHECK-SAME: <4 x i32> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i32> [[V]], i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMPVAR2]]
+;
+; Scalar i32 load inserted into lane 0 of %V: the loaded shadow (_MSLD) is inserted into lane 0 of %V's shadow.
+  %tmpvar1 = load i32, ptr %bar
+  %tmpvar2 = insertelement <4 x i32> %V, i32 %tmpvar1, i32 0
+  ret <4 x i32> %tmpvar2
+}
+
+define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <4 x float> @ld1_4s_float(
+; CHECK-SAME: <4 x float> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load float, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x float> [[V]], float [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMPVAR2]]
+;
+; Scalar float load inserted into lane 0 of %V: the shadow is handled as i32 / <4 x i32> and inserted into lane 0 of %V's shadow.
+  %tmpvar1 = load float, ptr %bar
+  %tmpvar2 = insertelement <4 x float> %V, float %tmpvar1, i32 0
+  ret <4 x float> %tmpvar2
+}
+
+define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <2 x i64> @ld1_2d(
+; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i64, ptr [[BAR]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i64> [[V]], i64 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMPVAR2]]
+;
+; Scalar i64 load inserted into lane 0 of %V: the loaded shadow (_MSLD) is inserted into lane 0 of %V's shadow.
+  %tmpvar1 = load i64, ptr %bar
+  %tmpvar2 = insertelement <2 x i64> %V, i64 %tmpvar1, i32 0
+  ret <2 x i64> %tmpvar2
+}
+
+define <2 x double> @ld1_2d_double(<2 x double> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <2 x double> @ld1_2d_double(
+; CHECK-SAME: <2 x double> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load double, ptr [[BAR]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x double> [[V]], double [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMPVAR2]]
+;
+; Scalar double load inserted into lane 0 of %V: the shadow is handled as i64 / <2 x i64> and inserted into lane 0 of %V's shadow.
+  %tmpvar1 = load double, ptr %bar
+  %tmpvar2 = insertelement <2 x double> %V, double %tmpvar1, i32 0
+  ret <2 x double> %tmpvar2
+}
+
+define <1 x i64> @ld1_1d(ptr %p) #0 {
+; CHECK-LABEL: define <1 x i64> @ld1_1d(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP:%.*]] = load <1 x i64>, ptr [[P]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    store <1 x i64> [[_MSLD]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <1 x i64> [[TMP]]
+;
+; Plain <1 x i64> vector load: the loaded shadow (_MSLD) is stored directly as the return-value shadow.
+  %tmpvar = load <1 x i64>, ptr %p, align 8
+  ret <1 x i64> %tmpvar
+}
+
+define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) #0 {
+; CHECK-SD-LABEL: ld1_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    ld1.b { v0 }[0], [x0]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ld1_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr b1, [x0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov.b v0[0], v1[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+; NOTE: the CHECK-SD/CHECK-GI lines above are leftovers from the forked CodeGen test; FileCheck runs with its default prefix here, so they are never matched.
+; CHECK-LABEL: define <8 x i8> @ld1_8b(
+; CHECK-SAME: <8 x i8> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i8, ptr [[BAR]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i8> [[V]], i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[TMPVAR2]]
+;
+  %tmpvar1 = load i8, ptr %bar
+  %tmpvar2 = insertelement <8 x i8> %V, i8 %tmpvar1, i32 0
+  ret <8 x i8> %tmpvar2
+}
+
+define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <4 x i16> @ld1_4h(
+; CHECK-SAME: <4 x i16> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i16, ptr [[BAR]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i16> [[V]], i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i16> [[TMPVAR2]]
+;
+; Scalar i16 load inserted into lane 0 of a 64-bit vector %V: the loaded shadow (_MSLD) is inserted into lane 0 of %V's shadow.
+  %tmpvar1 = load i16, ptr %bar
+  %tmpvar2 = insertelement <4 x i16> %V, i16 %tmpvar1, i32 0
+  ret <4 x i16> %tmpvar2
+}
+
+define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <2 x i32> @ld1_2s(
+; CHECK-SAME: <2 x i32> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> [[V]], i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i32> [[TMPVAR2]]
+;
+; Scalar i32 load inserted into lane 0 of a 64-bit vector %V: the loaded shadow (_MSLD) is inserted into lane 0 of %V's shadow.
+  %tmpvar1 = load i32, ptr %bar
+  %tmpvar2 = insertelement <2 x i32> %V, i32 %tmpvar1, i32 0
+  ret <2 x i32> %tmpvar2
+}
+
+define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) #0 {
+; CHECK-LABEL: define <2 x float> @ld1_2s_float(
+; CHECK-SAME: <2 x float> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load float, ptr [[BAR]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[BAR]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x float> [[V]], float [[TMPVAR1]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x float> [[TMPVAR2]]
+;
+; Scalar float load inserted into lane 0 of %V: the shadow is handled as i32 / <2 x i32> and inserted into lane 0 of %V's shadow.
+  %tmpvar1 = load float, ptr %bar
+  %tmpvar2 = insertelement <2 x float> %V, float %tmpvar1, i32 0
+  ret <2 x float> %tmpvar2
+}
+
+
+; Add rdar://13098923 test case: vld1_dup_u32 doesn't generate ld1r.2s
+define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture %diff) nounwind ssp #0 {
+; CHECK-SD-LABEL: ld1r_2s_from_dup:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    ldr s1, [x1]
+; CHECK-SD-NEXT:    usubl.8h v0, v0, v1
+; CHECK-SD-NEXT:    str d0, [x2]
+; CHECK-SD-NEXT:    ret
+; NOTE: the CHECK-SD/CHECK-GI lines around this note are leftovers from the forked CodeGen test; FileCheck runs with its default prefix here, so they are never matched.
+; CHECK-GI-LABEL: ld1r_2s_from_dup:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ld1r.2s { v0 }, [x0]
+; CHECK-GI-NEXT:    ld1r.2s { v1 }, [x1]
+; CHECK-GI-NEXT:    usubl.8h v0, v0, v1
+; CHECK-GI-NEXT:    str d0, [x2]
+; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: define void @ld1r_2s_from_dup(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture [[B:%.*]], ptr nocapture [[DIFF:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP22:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> undef, i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i32> [[_MSPROP]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMPVAR2]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[_MSPROP1]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP9]], label [[TMP23:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    [[TMPVAR5:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
+; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT:    [[_MSLD2:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD2]], i32 0
+; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <2 x i32> undef, i32 [[TMPVAR5]], i32 0
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <2 x i32> [[_MSPROP3]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE1:%.*]] = shufflevector <2 x i32> [[TMPVAR6]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[_MSPROP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMPVAR7:%.*]] = bitcast <2 x i32> [[LANE1]] to <8 x i8>
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i16>
+; CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i16>
+; CHECK-NEXT:    [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[TMPVAR7]] to <8 x i16>
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = or <8 x i16> [[_MSPROP5]], [[_MSPROP6]]
+; CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP7]] to <2 x i64>
+; CHECK-NEXT:    [[TMPVAR8:%.*]] = bitcast <8 x i16> [[SUB_I]] to <2 x i64>
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = shufflevector <2 x i64> [[TMP15]], <2 x i64> splat (i64 -1), <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMPVAR8]], <2 x i64> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[_MSPROP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
+; CHECK-NEXT:    [[_MSCMP10:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP10]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       17:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       18:
+; CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[DIFF]] to i64
+; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
+; CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+; CHECK-NEXT:    store <4 x i16> [[TMP16]], ptr [[TMP21]], align 8
+; CHECK-NEXT:    store <4 x i16> [[TMP9]], ptr [[DIFF]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmpvar1 = load i32, ptr %a, align 4
+  %tmpvar2 = insertelement <2 x i32> undef, i32 %tmpvar1, i32 0
+  %lane = shufflevector <2 x i32> %tmpvar2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmpvar3 = bitcast <2 x i32> %lane to <8 x i8>
+  %tmpvar5 = load i32, ptr %b, align 4
+  %tmpvar6 = insertelement <2 x i32> undef, i32 %tmpvar5, i32 0
+  %lane1 = shufflevector <2 x i32> %tmpvar6, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmpvar7 = bitcast <2 x i32> %lane1 to <8 x i8>
+  %vmovl.i.i = zext <8 x i8> %tmpvar3 to <8 x i16>
+  %vmovl.i4.i = zext <8 x i8> %tmpvar7 to <8 x i16>
+  %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i4.i
+  %tmpvar8 = bitcast <8 x i16> %sub.i to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %tmpvar8, <2 x i64> undef, <1 x i32> zeroinitializer
+  %tmpvar9 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+  store <4 x i16> %tmpvar9, ptr %diff, align 8
+  ret void
+}
+
+; Tests for rdar://11947069: vld1_dup_* and vld1q_dup_* code gen is suboptimal
+define <4 x float> @ld1r_4s_float(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <4 x float> @ld1r_4s_float(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TMP]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <4 x float> [[TMPVAR3]], float [[TMP]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMPVAR4]]
+;
+entry:
+; One loaded float is splatted with four insertelements; the loaded shadow (_MSLD) is inserted into each of the four shadow lanes.
+  %tmpvar = load float, ptr %x, align 4
+  %tmpvar1 = insertelement <4 x float> undef, float %tmpvar, i32 0
+  %tmpvar2 = insertelement <4 x float> %tmpvar1, float %tmpvar, i32 1
+  %tmpvar3 = insertelement <4 x float> %tmpvar2, float %tmpvar, i32 2
+  %tmpvar4 = insertelement <4 x float> %tmpvar3, float %tmpvar, i32 3
+  ret <4 x float> %tmpvar4
+}
+
+define <2 x float> @ld1r_2s_float(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <2 x float> @ld1r_2s_float(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
+;
+entry:
+; One loaded float is splatted with two insertelements; the loaded shadow (_MSLD) is inserted into both shadow lanes.
+  %tmpvar = load float, ptr %x, align 4
+  %tmpvar1 = insertelement <2 x float> undef, float %tmpvar, i32 0
+  %tmpvar2 = insertelement <2 x float> %tmpvar1, float %tmpvar, i32 1
+  ret <2 x float> %tmpvar2
+}
+
+define <2 x double> @ld1r_2d_double(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <2 x double> @ld1r_2d_double(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP]], i32 1
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+entry:
+; One loaded double (load is only 4-byte aligned, as is its shadow load) is splatted with two insertelements; _MSLD is inserted into both shadow lanes.
+  %tmpvar = load double, ptr %x, align 4
+  %tmpvar1 = insertelement <2 x double> undef, double %tmpvar, i32 0
+  %tmpvar2 = insertelement <2 x double> %tmpvar1, double %tmpvar, i32 1
+  ret <2 x double> %tmpvar2
+}
+
+define <1 x double> @ld1r_1d_double(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <1 x double> @ld1r_1d_double(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <1 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <1 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <1 x double> [[TMP1]]
+;
+entry:
+; One loaded double (4-byte aligned load) is inserted into the only lane of a <1 x double>; _MSLD is inserted into the single shadow lane.
+  %tmpvar = load double, ptr %x, align 4
+  %tmpvar1 = insertelement <1 x double> undef, double %tmpvar, i32 0
+  ret <1 x double> %tmpvar1
+}
+
+define <4 x float> @ld1r_4s_float_shuff(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <4 x float> @ld1r_4s_float_shuff(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[_MSPROP]], <4 x i32> splat (i32 -1), <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[LANE]]
+;
+entry:
+; One loaded float is splatted via insertelement + zero-mask shufflevector; the shadow goes through the same insertelement/shufflevector pair.
+  %tmpvar = load float, ptr %x, align 4
+  %tmpvar1 = insertelement <4 x float> undef, float %tmpvar, i32 0
+  %lane = shufflevector <4 x float> %tmpvar1, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %lane
+}
+
+define <2 x float> @ld1r_2s_float_shuff(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <2 x float> @ld1r_2s_float_shuff(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i32> [[_MSPROP]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x float> [[LANE]]
+;
+entry:
+; One loaded float is splatted via insertelement + zero-mask shufflevector; the shadow goes through the same insertelement/shufflevector pair.
+  %tmpvar = load float, ptr %x, align 4
+  %tmpvar1 = insertelement <2 x float> undef, float %tmpvar, i32 0
+  %lane = shufflevector <2 x float> %tmpvar1, <2 x float> undef, <2 x i32> zeroinitializer
+  ret <2 x float> %lane
+}
+
+; Broadcast of a loaded double into <2 x double>, written as a scalar load,
+; an insertelement into lane 0 and a zero-mask shufflevector.
+; Expected instrumentation, per the CHECK lines below:
+;  - the shadow of the pointer argument is tested first, branching to
+;    __msan_warning_noreturn when it is nonzero;
+;  - the scalar's i64 shadow is loaded from the address obtained by XORing %x
+;    with 193514046488576; note that the double is loaded with align 4 and
+;    the shadow load carries the same alignment;
+;  - that shadow is inserted into an all-ones <2 x i64>, shuffled with the
+;    same zero mask, and stored to @__msan_retval_tls.
+define <2 x double> @ld1r_2d_double_shuff(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <2 x double> @ld1r_2d_double_shuff(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[_MSPROP]], <2 x i64> splat (i64 -1), <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[LANE]]
+;
+entry:
+; Input IR: load the scalar, place it in lane 0, then splat lane 0.
+  %tmpvar = load double, ptr %x, align 4
+  %tmpvar1 = insertelement <2 x double> undef, double %tmpvar, i32 0
+  %lane = shufflevector <2 x double> %tmpvar1, <2 x double> undef, <2 x i32> zeroinitializer
+  ret <2 x double> %lane
+}
+
+; Single-lane variant of the load-and-splat pattern: a loaded double is
+; inserted into lane 0 of <1 x double> and passed through a one-element
+; zero-mask shufflevector.
+; Expected instrumentation, per the CHECK lines below:
+;  - the shadow of the pointer argument is tested first, branching to
+;    __msan_warning_noreturn when it is nonzero;
+;  - the scalar's i64 shadow is loaded from the address obtained by XORing %x
+;    with 193514046488576, using the same align 4 as the data load;
+;  - that shadow is inserted into an all-ones <1 x i64>, shuffled with the
+;    same mask, and stored to @__msan_retval_tls.
+define <1 x double> @ld1r_1d_double_shuff(ptr nocapture %x) #0 {
+; CHECK-LABEL: define <1 x double> @ld1r_1d_double_shuff(
+; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK:       1:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       2:
+; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <1 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <1 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <1 x i64> [[_MSPROP]], <1 x i64> splat (i64 -1), <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <1 x double> [[LANE]]
+;
+entry:
+; Input IR: load the scalar, place it in lane 0, then splat lane 0.
+  %tmpvar = load double, ptr %x, align 4
+  %tmpvar1 = insertelement <1 x double> undef, double %tmpvar, i32 0
+  %lane = shufflevector <1 x double> %tmpvar1, <1 x double> undef, <1 x i32> zeroinitializer
+  ret <1 x double> %lane
+}
+
+; Named struct types holding two, three and four <2 x float> vectors.
+%struct.__neon_float32x2x2_t = type { <2 x float>,  <2 x float> }
+%struct.__neon_float32x2x3_t = type { <2 x float>,  <2 x float>,  <2 x float> }
+%struct.__neon_float32x2x4_t = type { <2 x float>,  <2 x float>, <2 x float>,  <2 x float> }
+
+; ld1x2 intrinsic declarations for the 64-bit vector types. The integer and
+; float64 struct types used as return types are defined outside this section.
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr) nounwind readonly
+
+; ld1x2 returning two <8 x i8> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory. The call result is a literal struct that is repacked into the named
+; struct type with extractvalue/insertvalue.
+define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X2_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0(ptr %addr)
+  ret %struct.__neon_int8x8x2_t %val
+}
+
+; ld1x2 returning two <4 x i16> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X2_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0(ptr %addr)
+  ret %struct.__neon_int16x4x2_t %val
+}
+
+; ld1x2 returning two <2 x i32> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X2_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0(ptr %addr)
+  ret %struct.__neon_int32x2x2_t %val
+}
+
+; ld1x2 returning two <2 x float> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory. The return shadow is expressed with <2 x i32> lanes.
+define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X2_T:%.*]] poison, <2 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X2_T]] [[TMP6]], <2 x float> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X2X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0(ptr %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+; ld1x2 returning two <1 x i64> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X2_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr %addr)
+  ret %struct.__neon_int64x1x2_t %val
+}
+
+; ld1x2 returning two <1 x double> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory. The return shadow is expressed with <1 x i64> lanes.
+define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X2_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X2_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr %addr)
+  ret %struct.__neon_float64x1x2_t %val
+}
+
+
+; Named struct types holding two, three and four <4 x float> vectors.
+%struct.__neon_float32x4x2_t = type { <4 x float>,  <4 x float> }
+%struct.__neon_float32x4x3_t = type { <4 x float>,  <4 x float>,  <4 x float> }
+%struct.__neon_float32x4x4_t = type { <4 x float>,  <4 x float>, <4 x float>,  <4 x float> }
+
+; Named struct types holding two, three and four <2 x double> vectors.
+%struct.__neon_float64x2x2_t = type { <2 x double>,  <2 x double> }
+%struct.__neon_float64x2x3_t = type { <2 x double>,  <2 x double>,  <2 x double> }
+%struct.__neon_float64x2x4_t = type { <2 x double>,  <2 x double>, <2 x double>,  <2 x double> }
+
+; ld1x2 intrinsic declarations for the 128-bit vector types. The integer
+; struct types used as return types are defined outside this section.
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr) nounwind readonly
+
+; ld1x2 returning two <16 x i8> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X2_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0(ptr %addr)
+  ret %struct.__neon_int8x16x2_t %val
+}
+
+; ld1x2 returning two <8 x i16> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X2_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0(ptr %addr)
+  ret %struct.__neon_int16x8x2_t %val
+}
+
+; ld1x2 returning two <4 x i32> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int32x4x2_t @ld1_x2_v4i32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld1_x2_v4i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X2_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0(ptr %addr)
+  ret %struct.__neon_int32x4x2_t %val
+}
+
+; ld1x2 returning two <4 x float> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory. The return shadow is expressed with <4 x i32> lanes.
+define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X2_T:%.*]] poison, <4 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X2_T]] [[TMP6]], <4 x float> [[TMP7]], 1
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X4X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0(ptr %addr)
+  ret %struct.__neon_float32x4x2_t %val
+}
+
+; ld1x2 returning two <2 x i64> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X2_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr %addr)
+  ret %struct.__neon_int64x2x2_t %val
+}
+
+; ld1x2 returning two <2 x double> vectors. The file header lists ld1x2 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory. The return shadow is expressed with <2 x i64> lanes.
+define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X2_T:%.*]] poison, <2 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X2_T]] [[TMP6]], <2 x double> [[TMP7]], 1
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X2X2_T]] [[TMP8]]
+;
+  %val = call %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr %addr)
+  ret %struct.__neon_float64x2x2_t %val
+}
+
+; ld1x3 intrinsic declarations for the 64-bit vector types, each returning a
+; named struct of three vectors.
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0(ptr) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr) nounwind readonly
+
+; ld1x3 returning three <8 x i8> vectors. The file header lists ld1x3 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory. The call result is a literal struct that is repacked into the named
+; struct type with extractvalue/insertvalue.
+define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X3_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0(ptr %addr)
+  ret %struct.__neon_int8x8x3_t %val
+}
+
+; ld1x3 returning three <4 x i16> vectors. The file header lists ld1x3 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X3_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0(ptr %addr)
+  ret %struct.__neon_int16x4x3_t %val
+}
+
+; ld1x3 returning three <2 x i32> vectors. The file header lists ld1x3 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X3_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0(ptr %addr)
+  ret %struct.__neon_int32x2x3_t %val
+}
+
+; ld1x3 returning three <2 x float> vectors. The file header lists ld1x3 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory. The return shadow is expressed with <2 x i32> lanes.
+define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X3_T:%.*]] poison, <2 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X3_T]] [[TMP6]], <2 x float> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X3_T]] [[TMP8]], <2 x float> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X2X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0(ptr %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+; ld1x3 returning three <1 x i64> vectors. The file header lists ld1x3 as
+; incorrectly handled (handleUnknownInstruction): the checks below expect the
+; pointer argument's shadow to be tested before the call, and the shadow
+; stored to @__msan_retval_tls to be zeroinitializer, with no load from shadow
+; memory.
+define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X3_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X3_T]] [[TMP10]]
+;
+  %val = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr %addr)
+  ret %struct.__neon_int64x1x3_t %val
+}
+
+define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x3 returning three <1 x double> vectors.
+; CHECK-LABEL: define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP8]], <1 x double> [[TMP9]], 2
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP10]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow, typed <1 x i64> in place of <1 x double>, is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr %addr)
+  ret %struct.__neon_float64x1x3_t %val
+}
+
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0(ptr) nounwind readonly ; ld1x3 declarations for the 128-bit vector types used by the ld1_x3_* tests that follow.
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x3 returning three <16 x i8> vectors.
+; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X3_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP10]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0(ptr %addr)
+  ret %struct.__neon_int8x16x3_t %val
+}
+
+define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x3 returning three <8 x i16> vectors.
+; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X3_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP10]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0(ptr %addr)
+  ret %struct.__neon_int16x8x3_t %val
+}
+
+define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x3 returning three <4 x i32> vectors.
+; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X3_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP10]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0(ptr %addr)
+  ret %struct.__neon_int32x4x3_t %val
+}
+
+define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x3 returning three <4 x float> vectors.
+; CHECK-LABEL: define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X3_T:%.*]] poison, <4 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X3_T]] [[TMP6]], <4 x float> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X3_T]] [[TMP8]], <4 x float> [[TMP9]], 2
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X4X3_T]] [[TMP10]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow, typed <4 x i32> in place of <4 x float>, is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0(ptr %addr)
+  ret %struct.__neon_float32x4x3_t %val
+}
+
+define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x3 returning three <2 x i64> vectors.
+; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X3_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP10]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr %addr)
+  ret %struct.__neon_int64x2x3_t %val
+}
+
+define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x3 returning three <2 x double> vectors.
+; CHECK-LABEL: define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X3_T:%.*]] poison, <2 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X3_T]] [[TMP6]], <2 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X3_T]] [[TMP8]], <2 x double> [[TMP9]], 2
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X2X3_T]] [[TMP10]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow, typed <2 x i64> in place of <2 x double>, is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr %addr)
+  ret %struct.__neon_float64x2x3_t %val
+}
+
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0(ptr) nounwind readonly ; ld1x4 declarations for the 64-bit vector types used by the ld1_x4_* tests that follow.
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <8 x i8> vectors.
+; CHECK-LABEL: define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T:%.*]] poison, <8 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP6]], <8 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP8]], <8 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X8X4_T]] [[TMP10]], <8 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0(ptr %addr)
+  ret %struct.__neon_int8x8x4_t %val
+}
+
+define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <4 x i16> vectors.
+; CHECK-LABEL: define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T:%.*]] poison, <4 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP6]], <4 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP8]], <4 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X4X4_T]] [[TMP10]], <4 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0(ptr %addr)
+  ret %struct.__neon_int16x4x4_t %val
+}
+
+define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <2 x i32> vectors.
+; CHECK-LABEL: define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T:%.*]] poison, <2 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP6]], <2 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP8]], <2 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X2X4_T]] [[TMP10]], <2 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0(ptr %addr)
+  ret %struct.__neon_int32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <2 x float> vectors.
+; CHECK-LABEL: define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X4_T:%.*]] poison, <2 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X4_T]] [[TMP6]], <2 x float> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X4_T]] [[TMP8]], <2 x float> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X2X4_T]] [[TMP10]], <2 x float> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X2X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow, typed <2 x i32> in place of <2 x float>, is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0(ptr %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <1 x i64> vectors.
+; CHECK-LABEL: define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T:%.*]] poison, <1 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP6]], <1 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP8]], <1 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X1X4_T]] [[TMP10]], <1 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr %addr)
+  ret %struct.__neon_int64x1x4_t %val
+}
+
+define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <1 x double> vectors.
+; CHECK-LABEL: define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T:%.*]] poison, <1 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP6]], <1 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP8]], <1 x double> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP10]], <1 x double> [[TMP11]], 3
+; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow, typed <1 x i64> in place of <1 x double>, is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr %addr)
+  ret %struct.__neon_float64x1x4_t %val
+}
+
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0(ptr) nounwind readonly ; ld1x4 declarations for the 128-bit vector types used by the ld1_x4_* tests that follow.
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0(ptr) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0(ptr) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0(ptr) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr) nounwind readonly
+declare %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr) nounwind readonly
+
+define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <16 x i8> vectors.
+; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T:%.*]] poison, <16 x i8> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP6]], <16 x i8> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP8]], <16 x i8> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT8X16X4_T]] [[TMP10]], <16 x i8> [[TMP11]], 3
+; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0(ptr %addr)
+  ret %struct.__neon_int8x16x4_t %val
+}
+
+define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <8 x i16> vectors.
+; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T:%.*]] poison, <8 x i16> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP6]], <8 x i16> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP8]], <8 x i16> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT16X8X4_T]] [[TMP10]], <8 x i16> [[TMP11]], 3
+; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0(ptr %addr)
+  ret %struct.__neon_int16x8x4_t %val
+}
+
+define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <4 x i32> vectors.
+; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T:%.*]] poison, <4 x i32> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP6]], <4 x i32> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP8]], <4 x i32> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT32X4X4_T]] [[TMP10]], <4 x i32> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0(ptr %addr)
+  ret %struct.__neon_int32x4x4_t %val
+}
+
+define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(ptr %addr) #0 { ; MSan test: llvm.aarch64.neon.ld1x4 returning four <4 x float> vectors.
+; CHECK-LABEL: define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X4_T:%.*]] poison, <4 x float> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X4_T]] [[TMP6]], <4 x float> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X4_T]] [[TMP8]], <4 x float> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT32X4X4_T]] [[TMP10]], <4 x float> [[TMP11]], 3
+; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT32X4X4_T]] [[TMP12]]
+; Expected output: the i64 parameter shadow (for %addr, the only argument) is checked before the load, and an all-zero shadow, typed <4 x i32> in place of <4 x float>, is stored to @__msan_retval_tls instead of a shadow derived from the loaded memory.
+  %val = call %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0(ptr %addr)
+  ret %struct.__neon_float32x4x4_t %val
+}
+
+; ld1x4 returning four <2 x i64> vectors through %struct.__neon_int64x2x4_t.
+; The assertions below pin the current instrumentation, which the file header
+; lists as incorrectly handled: the shadow of %addr is read from
+; @__msan_param_tls and, if non-zero, __msan_warning_noreturn is called; the
+; shadow of the returned struct is stored to @__msan_retval_tls as
+; zeroinitializer.
+define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T:%.*]] poison, <2 x i64> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP6]], <2 x i64> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP8]], <2 x i64> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_INT64X2X4_T]] [[TMP10]], <2 x i64> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP12]]
+;
+  %val = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr %addr)
+  ret %struct.__neon_int64x2x4_t %val
+}
+
+; ld1x4 returning four <2 x double> vectors through %struct.__neon_float64x2x4_t.
+; The assertions below pin the current instrumentation, which the file header
+; lists as incorrectly handled: the shadow of %addr is read from
+; @__msan_param_tls and, if non-zero, __msan_warning_noreturn is called; the
+; shadow of the returned struct is stored to @__msan_retval_tls as
+; zeroinitializer (four <2 x i64> shadow vectors).
+define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) #0 {
+; CHECK-LABEL: define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(
+; CHECK-SAME: ptr [[ADDR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr [[ADDR]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X4_T:%.*]] poison, <2 x double> [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X4_T]] [[TMP6]], <2 x double> [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X4_T]] [[TMP8]], <2 x double> [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[TMP4]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [[STRUCT___NEON_FLOAT64X2X4_T]] [[TMP10]], <2 x double> [[TMP11]], 3
+; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X2X4_T]] [[TMP12]]
+;
+  %val = call %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr %addr)
+  ret %struct.__neon_float64x2x4_t %val
+}
+
+; Splat of a byte loaded from an alloca that is never stored to; %__ret is
+; unused and no ld1 intrinsic is involved. The assertions below show MSan
+; poisoning the alloca's shadow (memset of -1), reloading that shadow for the
+; scalar load, and propagating it through the insertelement/shufflevector pair
+; into @__msan_retval_tls.
+define <8 x i8> @dup_ld1_from_stack(ptr %__ret) #0 {
+; CHECK-LABEL: define <8 x i8> @dup_ld1_from_stack(
+; CHECK-SAME: ptr [[__RET:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[ITEM:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[ITEM]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 193514046488576
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 1 [[TMP2]], i8 -1, i64 1, i1 false)
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ITEM]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[ITEM]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i8> poison, i8 [[TMP3]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i8> [[_MSPROP]], <8 x i8> [[_MSPROP]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    store <8 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[LANE]]
+;
+entry:
+  %item = alloca i8, align 1
+  %0 = load i8, ptr %item, align 1
+  %1 = insertelement <8 x i8> poison, i8 %0, i32 0
+  %lane = shufflevector <8 x i8> %1, <8 x i8> %1, <8 x i32> zeroinitializer
+  ret <8 x i8> %lane
+}
+
+; Attribute group #0: enables MemorySanitizer checks for the functions that reference it.
+attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.

>From a3f91c1cd1dc8860343160188a6a464e49d09334 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Tue, 4 Mar 2025 18:31:11 +0000
Subject: [PATCH 2/3] Replace undef with poison

---
 .../MemorySanitizer/AArch64/arm64-ld1.ll      | 118 +++++++++---------
 1 file changed, 59 insertions(+), 59 deletions(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
index 673a14a9371b4..10e3402ae7482 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
@@ -1461,7 +1461,7 @@ define <8 x i8> @ld1r_8b(ptr %bar) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP6]], align 1
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i8> undef, i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i8> poison, i8 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <8 x i8> [[TMPVAR2]], i8 [[TMPVAR1]], i32 1
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[_MSLD]], i32 2
@@ -1481,7 +1481,7 @@ define <8 x i8> @ld1r_8b(ptr %bar) #0 {
 ;
 ; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i8, ptr %bar
-  %tmpvar2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmpvar1, i32 0
+  %tmpvar2 = insertelement <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, i8 %tmpvar1, i32 0
   %tmpvar3 = insertelement <8 x i8> %tmpvar2, i8 %tmpvar1, i32 1
   %tmpvar4 = insertelement <8 x i8> %tmpvar3, i8 %tmpvar1, i32 2
   %tmpvar5 = insertelement <8 x i8> %tmpvar4, i8 %tmpvar1, i32 3
@@ -1509,7 +1509,7 @@ define <16 x i8> @ld1r_16b(ptr %bar) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i8, ptr [[TMP6]], align 1
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> splat (i8 -1), i8 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <16 x i8> undef, i8 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <16 x i8> poison, i8 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <16 x i8> [[TMPVAR2]], i8 [[TMPVAR1]], i32 1
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[_MSLD]], i32 2
@@ -1545,7 +1545,7 @@ define <16 x i8> @ld1r_16b(ptr %bar) #0 {
 ;
 ; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i8, ptr %bar
-  %tmpvar2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmpvar1, i32 0
+  %tmpvar2 = insertelement <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, i8 %tmpvar1, i32 0
   %tmpvar3 = insertelement <16 x i8> %tmpvar2, i8 %tmpvar1, i32 1
   %tmpvar4 = insertelement <16 x i8> %tmpvar3, i8 %tmpvar1, i32 2
   %tmpvar5 = insertelement <16 x i8> %tmpvar4, i8 %tmpvar1, i32 3
@@ -1581,7 +1581,7 @@ define <4 x i16> @ld1r_4h(ptr %bar) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP6]], align 2
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i16> undef, i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i16> poison, i16 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i16> [[_MSPROP]], i16 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x i16> [[TMPVAR2]], i16 [[TMPVAR1]], i32 1
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i16> [[_MSPROP1]], i16 [[_MSLD]], i32 2
@@ -1593,7 +1593,7 @@ define <4 x i16> @ld1r_4h(ptr %bar) #0 {
 ;
 ; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i16, ptr %bar
-  %tmpvar2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmpvar1, i32 0
+  %tmpvar2 = insertelement <4 x i16> <i16 poison, i16 poison, i16 poison, i16 poison>, i16 %tmpvar1, i32 0
   %tmpvar3 = insertelement <4 x i16> %tmpvar2, i16 %tmpvar1, i32 1
   %tmpvar4 = insertelement <4 x i16> %tmpvar3, i16 %tmpvar1, i32 2
   %tmpvar5 = insertelement <4 x i16> %tmpvar4, i16 %tmpvar1, i32 3
@@ -1617,7 +1617,7 @@ define <8 x i16> @ld1r_8h(ptr %bar) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, ptr [[TMP6]], align 2
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i16> undef, i16 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <8 x i16> poison, i16 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <8 x i16> [[TMPVAR2]], i16 [[TMPVAR1]], i32 1
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[_MSLD]], i32 2
@@ -1637,7 +1637,7 @@ define <8 x i16> @ld1r_8h(ptr %bar) #0 {
 ;
 ; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i16, ptr %bar
-  %tmpvar2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmpvar1, i32 0
+  %tmpvar2 = insertelement <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>, i16 %tmpvar1, i32 0
   %tmpvar3 = insertelement <8 x i16> %tmpvar2, i16 %tmpvar1, i32 1
   %tmpvar4 = insertelement <8 x i16> %tmpvar3, i16 %tmpvar1, i32 2
   %tmpvar5 = insertelement <8 x i16> %tmpvar4, i16 %tmpvar1, i32 3
@@ -1665,7 +1665,7 @@ define <2 x i32> @ld1r_2s(ptr %bar) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> undef, i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> poison, i32 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMPVAR2]], i32 [[TMPVAR1]], i32 1
 ; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
@@ -1673,7 +1673,7 @@ define <2 x i32> @ld1r_2s(ptr %bar) #0 {
 ;
 ; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i32, ptr %bar
-  %tmpvar2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmpvar1, i32 0
+  %tmpvar2 = insertelement <2 x i32> <i32 poison, i32 poison>, i32 %tmpvar1, i32 0
   %tmpvar3 = insertelement <2 x i32> %tmpvar2, i32 %tmpvar1, i32 1
   ret <2 x i32> %tmpvar3
 }
@@ -1695,7 +1695,7 @@ define <4 x i32> @ld1r_4s(ptr %bar) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i32> undef, i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x i32> poison, i32 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x i32> [[TMPVAR2]], i32 [[TMPVAR1]], i32 1
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
@@ -1707,7 +1707,7 @@ define <4 x i32> @ld1r_4s(ptr %bar) #0 {
 ;
 ; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i32, ptr %bar
-  %tmpvar2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmpvar1, i32 0
+  %tmpvar2 = insertelement <4 x i32> <i32 poison, i32 poison, i32 poison, i32 poison>, i32 %tmpvar1, i32 0
   %tmpvar3 = insertelement <4 x i32> %tmpvar2, i32 %tmpvar1, i32 1
   %tmpvar4 = insertelement <4 x i32> %tmpvar3, i32 %tmpvar1, i32 2
   %tmpvar5 = insertelement <4 x i32> %tmpvar4, i32 %tmpvar1, i32 3
@@ -1731,7 +1731,7 @@ define <2 x i64> @ld1r_2d(ptr %bar) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i64> undef, i64 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i64> poison, i64 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMPVAR2]], i64 [[TMPVAR1]], i32 1
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
@@ -1739,7 +1739,7 @@ define <2 x i64> @ld1r_2d(ptr %bar) #0 {
 ;
 ; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i64, ptr %bar
-  %tmpvar2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmpvar1, i32 0
+  %tmpvar2 = insertelement <2 x i64> <i64 poison, i64 poison>, i64 %tmpvar1, i32 0
   %tmpvar3 = insertelement <2 x i64> %tmpvar2, i64 %tmpvar1, i32 1
   ret <2 x i64> %tmpvar3
 }
@@ -2747,7 +2747,7 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ; CHECK-GI-NEXT:    str d0, [x2]
 ; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define void @ld1r_2s_from_dup(
-; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture [[B:%.*]], ptr nocapture [[DIFF:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], ptr captures(none) [[DIFF:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
@@ -2765,9 +2765,9 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> undef, i32 [[TMPVAR1]], i32 0
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> poison, i32 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i32> [[_MSPROP]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMPVAR2]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMPVAR2]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[_MSPROP1]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 ; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -2782,9 +2782,9 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
 ; CHECK-NEXT:    [[_MSLD2:%.*]] = load i32, ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD2]], i32 0
-; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <2 x i32> undef, i32 [[TMPVAR5]], i32 0
+; CHECK-NEXT:    [[TMPVAR6:%.*]] = insertelement <2 x i32> poison, i32 [[TMPVAR5]], i32 0
 ; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <2 x i32> [[_MSPROP3]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[LANE1:%.*]] = shufflevector <2 x i32> [[TMPVAR6]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE1:%.*]] = shufflevector <2 x i32> [[TMPVAR6]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[_MSPROP4]] to <8 x i8>
 ; CHECK-NEXT:    [[TMPVAR7:%.*]] = bitcast <2 x i32> [[LANE1]] to <8 x i8>
 ; CHECK-NEXT:    [[_MSPROP5:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i16>
@@ -2796,7 +2796,7 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP7]] to <2 x i64>
 ; CHECK-NEXT:    [[TMPVAR8:%.*]] = bitcast <8 x i16> [[SUB_I]] to <2 x i64>
 ; CHECK-NEXT:    [[_MSPROP8:%.*]] = shufflevector <2 x i64> [[TMP15]], <2 x i64> splat (i64 -1), <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMPVAR8]], <2 x i64> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMPVAR8]], <2 x i64> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[_MSPROP8]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
 ; CHECK-NEXT:    [[_MSCMP10:%.*]] = icmp ne i64 [[TMP2]], 0
@@ -2814,18 +2814,18 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ;
 entry:
   %tmpvar1 = load i32, ptr %a, align 4
-  %tmpvar2 = insertelement <2 x i32> undef, i32 %tmpvar1, i32 0
-  %lane = shufflevector <2 x i32> %tmpvar2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmpvar2 = insertelement <2 x i32> poison, i32 %tmpvar1, i32 0
+  %lane = shufflevector <2 x i32> %tmpvar2, <2 x i32> poison, <2 x i32> zeroinitializer
   %tmpvar3 = bitcast <2 x i32> %lane to <8 x i8>
   %tmpvar5 = load i32, ptr %b, align 4
-  %tmpvar6 = insertelement <2 x i32> undef, i32 %tmpvar5, i32 0
-  %lane1 = shufflevector <2 x i32> %tmpvar6, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmpvar6 = insertelement <2 x i32> poison, i32 %tmpvar5, i32 0
+  %lane1 = shufflevector <2 x i32> %tmpvar6, <2 x i32> poison, <2 x i32> zeroinitializer
   %tmpvar7 = bitcast <2 x i32> %lane1 to <8 x i8>
   %vmovl.i.i = zext <8 x i8> %tmpvar3 to <8 x i16>
   %vmovl.i4.i = zext <8 x i8> %tmpvar7 to <8 x i16>
   %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i4.i
   %tmpvar8 = bitcast <8 x i16> %sub.i to <2 x i64>
-  %shuffle.i = shufflevector <2 x i64> %tmpvar8, <2 x i64> undef, <1 x i32> zeroinitializer
+  %shuffle.i = shufflevector <2 x i64> %tmpvar8, <2 x i64> poison, <1 x i32> zeroinitializer
   %tmpvar9 = bitcast <1 x i64> %shuffle.i to <4 x i16>
   store <4 x i16> %tmpvar9, ptr %diff, align 8
   ret void
@@ -2834,7 +2834,7 @@ entry:
 ; Tests for rdar://11947069: vld1_dup_* and vld1q_dup_* code gen is suboptimal
 define <4 x float> @ld1r_4s_float(ptr nocapture %x) #0 {
 ; CHECK-LABEL: define <4 x float> @ld1r_4s_float(
-; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr captures(none) [[X:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -2850,7 +2850,7 @@ define <4 x float> @ld1r_4s_float(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TMP]], i32 1
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
@@ -2863,7 +2863,7 @@ define <4 x float> @ld1r_4s_float(ptr nocapture %x) #0 {
 entry:
 ; Make sure we are using the operands defined by the ABI
   %tmpvar = load float, ptr %x, align 4
-  %tmpvar1 = insertelement <4 x float> undef, float %tmpvar, i32 0
+  %tmpvar1 = insertelement <4 x float> poison, float %tmpvar, i32 0
   %tmpvar2 = insertelement <4 x float> %tmpvar1, float %tmpvar, i32 1
   %tmpvar3 = insertelement <4 x float> %tmpvar2, float %tmpvar, i32 2
   %tmpvar4 = insertelement <4 x float> %tmpvar3, float %tmpvar, i32 3
@@ -2872,7 +2872,7 @@ entry:
 
 define <2 x float> @ld1r_2s_float(ptr nocapture %x) #0 {
 ; CHECK-LABEL: define <2 x float> @ld1r_2s_float(
-; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr captures(none) [[X:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -2888,7 +2888,7 @@ define <2 x float> @ld1r_2s_float(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP]], i32 1
 ; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
@@ -2897,14 +2897,14 @@ define <2 x float> @ld1r_2s_float(ptr nocapture %x) #0 {
 entry:
 ; Make sure we are using the operands defined by the ABI
   %tmpvar = load float, ptr %x, align 4
-  %tmpvar1 = insertelement <2 x float> undef, float %tmpvar, i32 0
+  %tmpvar1 = insertelement <2 x float> poison, float %tmpvar, i32 0
   %tmpvar2 = insertelement <2 x float> %tmpvar1, float %tmpvar, i32 1
   ret <2 x float> %tmpvar2
 }
 
 define <2 x double> @ld1r_2d_double(ptr nocapture %x) #0 {
 ; CHECK-LABEL: define <2 x double> @ld1r_2d_double(
-; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr captures(none) [[X:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -2920,7 +2920,7 @@ define <2 x double> @ld1r_2d_double(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP]], i32 1
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
@@ -2929,14 +2929,14 @@ define <2 x double> @ld1r_2d_double(ptr nocapture %x) #0 {
 entry:
 ; Make sure we are using the operands defined by the ABI
   %tmpvar = load double, ptr %x, align 4
-  %tmpvar1 = insertelement <2 x double> undef, double %tmpvar, i32 0
+  %tmpvar1 = insertelement <2 x double> poison, double %tmpvar, i32 0
   %tmpvar2 = insertelement <2 x double> %tmpvar1, double %tmpvar, i32 1
   ret <2 x double> %tmpvar2
 }
 
 define <1 x double> @ld1r_1d_double(ptr nocapture %x) #0 {
 ; CHECK-LABEL: define <1 x double> @ld1r_1d_double(
-; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr captures(none) [[X:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -2952,20 +2952,20 @@ define <1 x double> @ld1r_1d_double(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <1 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <1 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <1 x double> poison, double [[TMP]], i32 0
 ; CHECK-NEXT:    store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <1 x double> [[TMP1]]
 ;
 entry:
 ; Make sure we are using the operands defined by the ABI
   %tmpvar = load double, ptr %x, align 4
-  %tmpvar1 = insertelement <1 x double> undef, double %tmpvar, i32 0
+  %tmpvar1 = insertelement <1 x double> poison, double %tmpvar, i32 0
   ret <1 x double> %tmpvar1
 }
 
 define <4 x float> @ld1r_4s_float_shuff(ptr nocapture %x) #0 {
 ; CHECK-LABEL: define <4 x float> @ld1r_4s_float_shuff(
-; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr captures(none) [[X:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -2981,23 +2981,23 @@ define <4 x float> @ld1r_4s_float_shuff(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[_MSPROP]], <4 x i32> splat (i32 -1), <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[LANE]]
 ;
 entry:
 ; Make sure we are using the operands defined by the ABI
   %tmpvar = load float, ptr %x, align 4
-  %tmpvar1 = insertelement <4 x float> undef, float %tmpvar, i32 0
-  %lane = shufflevector <4 x float> %tmpvar1, <4 x float> undef, <4 x i32> zeroinitializer
+  %tmpvar1 = insertelement <4 x float> poison, float %tmpvar, i32 0
+  %lane = shufflevector <4 x float> %tmpvar1, <4 x float> poison, <4 x i32> zeroinitializer
   ret <4 x float> %lane
 }
 
 define <2 x float> @ld1r_2s_float_shuff(ptr nocapture %x) #0 {
 ; CHECK-LABEL: define <2 x float> @ld1r_2s_float_shuff(
-; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr captures(none) [[X:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
@@ -3013,28 +3013,28 @@ define <2 x float> @ld1r_2s_float_shuff(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[TMP]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i32> [[_MSPROP]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x float> [[LANE]]
 ;
 entry:
 ; Make sure we are using the operands defined by the ABI
   %tmpvar = load float, ptr %x, align 4
-  %tmpvar1 = insertelement <2 x float> undef, float %tmpvar, i32 0
-  %lane = shufflevector <2 x float> %tmpvar1, <2 x float> undef, <2 x i32> zeroinitializer
+  %tmpvar1 = insertelement <2 x float> poison, float %tmpvar, i32 0
+  %lane = shufflevector <2 x float> %tmpvar1, <2 x float> poison, <2 x i32> zeroinitializer
   ret <2 x float> %lane
 }
 
 define <2 x double> @ld1r_2d_double_shuff(ptr nocapture %x) #0 {
 ; CHECK-LABEL: define <2 x double> @ld1r_2d_double_shuff(
-; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr captures(none) [[X:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP1:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
 ; CHECK:       1:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
@@ -3045,28 +3045,28 @@ define <2 x double> @ld1r_2d_double_shuff(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[_MSPROP]], <2 x i64> splat (i64 -1), <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMPVAR1]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x double> [[LANE]]
 ;
 entry:
 ; Make sure we are using the operands defined by the ABI
   %tmpvar = load double, ptr %x, align 4
-  %tmpvar1 = insertelement <2 x double> undef, double %tmpvar, i32 0
-  %lane = shufflevector <2 x double> %tmpvar1, <2 x double> undef, <2 x i32> zeroinitializer
+  %tmpvar1 = insertelement <2 x double> poison, double %tmpvar, i32 0
+  %lane = shufflevector <2 x double> %tmpvar1, <2 x double> poison, <2 x i32> zeroinitializer
   ret <2 x double> %lane
 }
 
 define <1 x double> @ld1r_1d_double_shuff(ptr nocapture %x) #0 {
 ; CHECK-LABEL: define <1 x double> @ld1r_1d_double_shuff(
-; CHECK-SAME: ptr nocapture [[X:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr captures(none) [[X:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP1:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
 ; CHECK:       1:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
@@ -3077,17 +3077,17 @@ define <1 x double> @ld1r_1d_double_shuff(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <1 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <1 x double> undef, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <1 x double> poison, double [[TMP]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <1 x i64> [[_MSPROP]], <1 x i64> splat (i64 -1), <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> undef, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMPVAR1]], <1 x double> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <1 x double> [[LANE]]
 ;
 entry:
 ; Make sure we are using the operands defined by the ABI
   %tmpvar = load double, ptr %x, align 4
-  %tmpvar1 = insertelement <1 x double> undef, double %tmpvar, i32 0
-  %lane = shufflevector <1 x double> %tmpvar1, <1 x double> undef, <1 x i32> zeroinitializer
+  %tmpvar1 = insertelement <1 x double> poison, double %tmpvar, i32 0
+  %lane = shufflevector <1 x double> %tmpvar1, <1 x double> poison, <1 x i32> zeroinitializer
   ret <1 x double> %lane
 }
 

>From fcb872654038bc4eae139be32d46b7cfdcdd2b0b Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Thu, 6 Mar 2025 02:26:04 +0000
Subject: [PATCH 3/3] Remove unused old assertions

---
 .../MemorySanitizer/AArch64/arm64-ld1.ll      | 480 +++++-------------
 1 file changed, 120 insertions(+), 360 deletions(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
index 10e3402ae7482..9bae334b2831f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll
@@ -1,15 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=msan -S | FileCheck %s
-;
 ; Forked from llvm/test/CodeGen/AArch64/arm64-ld1.ll
-;
 ; Incorrectly handled (handleUnknownInstruction):
 ; - llvm.aarch64.neon.ld1x2, llvm.aarch64.neon.ld1x3, llvm.aarch64.neon.ld1x4
 ; - llvm.aarch64.neon.ld2, llvm.aarch64.neon.ld3, llvm.aarch64.neon.ld4
 ; - llvm.aarch64.neon.ld2lane, llvm.aarch64.neon.ld3lane, llvm.aarch64.neon.ld4lane
 ; - llvm.aarch64.neon.ld2r, llvm.aarch64.neon.ld3r, llvm.aarch64.neon.ld4r
 
-
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-android9001"
 
@@ -18,6 +15,8 @@ target triple = "aarch64--linux-android9001"
 %struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>, <8 x i8>,  <8 x i8> }
 
 define %struct.__neon_int8x8x2_t @ld2_8b(ptr %A) nounwind #0 {
+; Make sure we are loading into the results defined by the ABI (i.e., v0, v1)
+; and from the argument of the function also defined by ABI (i.e., x0)
 ; CHECK-LABEL: define %struct.__neon_int8x8x2_t @ld2_8b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -36,13 +35,12 @@ define %struct.__neon_int8x8x2_t @ld2_8b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X2_T]] [[TMP8]]
 ;
-; Make sure we are loading into the results defined by the ABI (i.e., v0, v1)
-; and from the argument of the function also defined by ABI (i.e., x0)
   %tmpvar2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0(ptr %A)
   ret %struct.__neon_int8x8x2_t  %tmpvar2
 }
 
 define %struct.__neon_int8x8x3_t @ld3_8b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x8x3_t @ld3_8b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -63,12 +61,12 @@ define %struct.__neon_int8x8x3_t @ld3_8b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0(ptr %A)
   ret %struct.__neon_int8x8x3_t  %tmpvar2
 }
 
 define %struct.__neon_int8x8x4_t @ld4_8b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x8x4_t @ld4_8b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -91,7 +89,6 @@ define %struct.__neon_int8x8x4_t @ld4_8b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0(ptr %A)
   ret %struct.__neon_int8x8x4_t  %tmpvar2
 }
@@ -105,6 +102,7 @@ declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0(ptr) nounwind r
 %struct.__neon_int8x16x4_t = type { <16 x i8>,  <16 x i8>, <16 x i8>,  <16 x i8> }
 
 define %struct.__neon_int8x16x2_t @ld2_16b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2_16b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -123,12 +121,12 @@ define %struct.__neon_int8x16x2_t @ld2_16b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0(ptr %A)
   ret %struct.__neon_int8x16x2_t  %tmpvar2
 }
 
 define %struct.__neon_int8x16x3_t @ld3_16b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3_16b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -149,12 +147,12 @@ define %struct.__neon_int8x16x3_t @ld3_16b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0(ptr %A)
   ret %struct.__neon_int8x16x3_t  %tmpvar2
 }
 
 define %struct.__neon_int8x16x4_t @ld4_16b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4_16b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -177,7 +175,6 @@ define %struct.__neon_int8x16x4_t @ld4_16b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0(ptr %A)
   ret %struct.__neon_int8x16x4_t  %tmpvar2
 }
@@ -191,6 +188,7 @@ declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0(ptr) nounwind
 %struct.__neon_int16x4x4_t = type { <4 x i16>,  <4 x i16>, <4 x i16>,  <4 x i16> }
 
 define %struct.__neon_int16x4x2_t @ld2_4h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x4x2_t @ld2_4h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -209,12 +207,12 @@ define %struct.__neon_int16x4x2_t @ld2_4h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0(ptr %A)
   ret %struct.__neon_int16x4x2_t  %tmpvar2
 }
 
 define %struct.__neon_int16x4x3_t @ld3_4h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x4x3_t @ld3_4h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -235,12 +233,12 @@ define %struct.__neon_int16x4x3_t @ld3_4h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0(ptr %A)
   ret %struct.__neon_int16x4x3_t  %tmpvar2
 }
 
 define %struct.__neon_int16x4x4_t @ld4_4h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x4x4_t @ld4_4h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -263,7 +261,6 @@ define %struct.__neon_int16x4x4_t @ld4_4h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0(ptr %A)
   ret %struct.__neon_int16x4x4_t  %tmpvar2
 }
@@ -277,6 +274,7 @@ declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0(ptr) nounwind
 %struct.__neon_int16x8x4_t = type { <8 x i16>,  <8 x i16>, <8 x i16>,  <8 x i16> }
 
 define %struct.__neon_int16x8x2_t @ld2_8h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2_8h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -295,12 +293,12 @@ define %struct.__neon_int16x8x2_t @ld2_8h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0(ptr %A)
   ret %struct.__neon_int16x8x2_t  %tmpvar2
 }
 
 define %struct.__neon_int16x8x3_t @ld3_8h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3_8h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -321,12 +319,12 @@ define %struct.__neon_int16x8x3_t @ld3_8h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0(ptr %A)
   ret %struct.__neon_int16x8x3_t %tmpvar2
 }
 
 define %struct.__neon_int16x8x4_t @ld4_8h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4_8h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -349,7 +347,6 @@ define %struct.__neon_int16x8x4_t @ld4_8h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0(ptr %A)
   ret %struct.__neon_int16x8x4_t  %tmpvar2
 }
@@ -363,6 +360,7 @@ declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0(ptr) nounwind
 %struct.__neon_int32x2x4_t = type { <2 x i32>,  <2 x i32>, <2 x i32>,  <2 x i32> }
 
 define %struct.__neon_int32x2x2_t @ld2_2s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x2x2_t @ld2_2s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -381,12 +379,12 @@ define %struct.__neon_int32x2x2_t @ld2_2s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0(ptr %A)
   ret %struct.__neon_int32x2x2_t  %tmpvar2
 }
 
 define %struct.__neon_int32x2x3_t @ld3_2s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x2x3_t @ld3_2s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -407,12 +405,12 @@ define %struct.__neon_int32x2x3_t @ld3_2s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0(ptr %A)
   ret %struct.__neon_int32x2x3_t  %tmpvar2
 }
 
 define %struct.__neon_int32x2x4_t @ld4_2s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x2x4_t @ld4_2s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -435,7 +433,6 @@ define %struct.__neon_int32x2x4_t @ld4_2s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0(ptr %A)
   ret %struct.__neon_int32x2x4_t  %tmpvar2
 }
@@ -449,6 +446,7 @@ declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0(ptr) nounwind
 %struct.__neon_int32x4x4_t = type { <4 x i32>,  <4 x i32>, <4 x i32>,  <4 x i32> }
 
 define %struct.__neon_int32x4x2_t @ld2_4s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2_4s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -467,12 +465,12 @@ define %struct.__neon_int32x4x2_t @ld2_4s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0(ptr %A)
   ret %struct.__neon_int32x4x2_t  %tmpvar2
 }
 
 define %struct.__neon_int32x4x3_t @ld3_4s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3_4s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -493,12 +491,12 @@ define %struct.__neon_int32x4x3_t @ld3_4s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0(ptr %A)
   ret %struct.__neon_int32x4x3_t  %tmpvar2
 }
 
 define %struct.__neon_int32x4x4_t @ld4_4s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4_4s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -521,7 +519,6 @@ define %struct.__neon_int32x4x4_t @ld4_4s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0(ptr %A)
   ret %struct.__neon_int32x4x4_t  %tmpvar2
 }
@@ -535,6 +532,7 @@ declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0(ptr) nounwind
 %struct.__neon_int64x2x4_t = type { <2 x i64>,  <2 x i64>, <2 x i64>,  <2 x i64> }
 
 define %struct.__neon_int64x2x2_t @ld2_2d(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2_2d(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -553,12 +551,12 @@ define %struct.__neon_int64x2x2_t @ld2_2d(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0(ptr %A)
   ret %struct.__neon_int64x2x2_t  %tmpvar2
 }
 
 define %struct.__neon_int64x2x3_t @ld3_2d(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld3_2d(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -579,12 +577,12 @@ define %struct.__neon_int64x2x3_t @ld3_2d(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0(ptr %A)
   ret %struct.__neon_int64x2x3_t  %tmpvar2
 }
 
 define %struct.__neon_int64x2x4_t @ld4_2d(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4_2d(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -607,7 +605,6 @@ define %struct.__neon_int64x2x4_t @ld4_2d(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0(ptr %A)
   ret %struct.__neon_int64x2x4_t  %tmpvar2
 }
@@ -622,6 +619,7 @@ declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0(ptr) nounwind
 
 
 define %struct.__neon_int64x1x2_t @ld2_1di64(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x1x2_t @ld2_1di64(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -640,12 +638,12 @@ define %struct.__neon_int64x1x2_t @ld2_1di64(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0(ptr %A)
   ret %struct.__neon_int64x1x2_t  %tmpvar2
 }
 
 define %struct.__neon_int64x1x3_t @ld3_1di64(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x1x3_t @ld3_1di64(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -666,12 +664,12 @@ define %struct.__neon_int64x1x3_t @ld3_1di64(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0(ptr %A)
   ret %struct.__neon_int64x1x3_t  %tmpvar2
 }
 
 define %struct.__neon_int64x1x4_t @ld4_1di64(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x1x4_t @ld4_1di64(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -694,7 +692,6 @@ define %struct.__neon_int64x1x4_t @ld4_1di64(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0(ptr %A)
   ret %struct.__neon_int64x1x4_t  %tmpvar2
 }
@@ -710,6 +707,7 @@ declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0(ptr) nounwind
 
 
 define %struct.__neon_float64x1x2_t @ld2_1df64(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_float64x1x2_t @ld2_1df64(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -728,12 +726,12 @@ define %struct.__neon_float64x1x2_t @ld2_1df64(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0(ptr %A)
   ret %struct.__neon_float64x1x2_t  %tmpvar2
 }
 
 define %struct.__neon_float64x1x3_t @ld3_1df64(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_float64x1x3_t @ld3_1df64(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -754,12 +752,12 @@ define %struct.__neon_float64x1x3_t @ld3_1df64(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0(ptr %A)
   ret %struct.__neon_float64x1x3_t  %tmpvar2
 }
 
 define %struct.__neon_float64x1x4_t @ld4_1df64(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_float64x1x4_t @ld4_1df64(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -782,7 +780,6 @@ define %struct.__neon_float64x1x4_t @ld4_1df64(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_FLOAT64X1X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0(ptr %A)
   ret %struct.__neon_float64x1x4_t  %tmpvar2
 }
@@ -794,19 +791,6 @@ declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0(ptr) nounwi
 
 define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld2lane_16b:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    ld2.b { v0, v1 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld2lane_16b:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ld2.b { v0, v1 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2lane_16b(
 ; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
@@ -839,21 +823,6 @@ define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, ptr
 
 define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld3lane_16b:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    ld3.b { v0, v1, v2 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld3lane_16b:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    ld3.b { v0, v1, v2 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3lane_16b(
 ; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], <16 x i8> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
@@ -892,23 +861,6 @@ define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16
 
 define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld4lane_16b:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    ld4.b { v0, v1, v2, v3 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld4lane_16b:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    ld4.b { v0, v1, v2, v3 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4lane_16b(
 ; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], <16 x i8> [[L3:%.*]], <16 x i8> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
@@ -957,19 +909,6 @@ declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8>
 
 define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld2lane_8h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    ld2.h { v0, v1 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld2lane_8h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ld2.h { v0, v1 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2lane_8h(
 ; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
@@ -1002,21 +941,6 @@ define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, ptr
 
 define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld3lane_8h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    ld3.h { v0, v1, v2 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld3lane_8h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    ld3.h { v0, v1, v2 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3lane_8h(
 ; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], <8 x i16> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
@@ -1055,23 +979,6 @@ define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x
 
 define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld4lane_8h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    ld4.h { v0, v1, v2, v3 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld4lane_8h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    ld4.h { v0, v1, v2, v3 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4lane_8h(
 ; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], <8 x i16> [[L3:%.*]], <8 x i16> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
@@ -1120,19 +1027,6 @@ declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16>
 
 define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld2lane_4s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    ld2.s { v0, v1 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld2lane_4s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ld2.s { v0, v1 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2lane_4s(
 ; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
@@ -1165,21 +1059,6 @@ define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, ptr
 
 define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld3lane_4s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    ld3.s { v0, v1, v2 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld3lane_4s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    ld3.s { v0, v1, v2 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3lane_4s(
 ; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], <4 x i32> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
@@ -1218,23 +1097,6 @@ define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x
 
 define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld4lane_4s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    ld4.s { v0, v1, v2, v3 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld4lane_4s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    ld4.s { v0, v1, v2, v3 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4lane_4s(
 ; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], <4 x i32> [[L3:%.*]], <4 x i32> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
@@ -1283,19 +1145,6 @@ declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32>
 
 define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld2lane_2d:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    ld2.d { v0, v1 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld2lane_2d:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ld2.d { v0, v1 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2lane_2d(
 ; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
@@ -1328,21 +1177,6 @@ define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, ptr
 
 define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld3lane_2d:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-SD-NEXT:    ld3.d { v0, v1, v2 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld3lane_2d:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-GI-NEXT:    ld3.d { v0, v1, v2 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld3lane_2d(
 ; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], <2 x i64> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
@@ -1381,23 +1215,6 @@ define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x
 
 define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, ptr %A) nounwind #0 {
 ; Make sure we are using the operands defined by the ABI
-; CHECK-SD-LABEL: ld4lane_2d:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-SD-NEXT:    ld4.d { v0, v1, v2, v3 }[1], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld4lane_2d:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-GI-NEXT:    ld4.d { v0, v1, v2, v3 }[1], [x0]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4lane_2d(
 ; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], <2 x i64> [[L3:%.*]], <2 x i64> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
@@ -1445,6 +1262,7 @@ declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64>
 declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readonly
 
 define <8 x i8> @ld1r_8b(ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <8 x i8> @ld1r_8b(
 ; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1475,11 +1293,10 @@ define <8 x i8> @ld1r_8b(ptr %bar) #0 {
 ; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[_MSLD]], i32 6
 ; CHECK-NEXT:    [[TMPVAR8:%.*]] = insertelement <8 x i8> [[TMPVAR7]], i8 [[TMPVAR1]], i32 6
 ; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[_MSLD]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> [[TMPVAR8]], i8 [[TMPVAR1]], i32 7
+; CHECK-NEXT:    [[TMPVAR9:%.*]] = insertelement <8 x i8> [[TMPVAR8]], i8 [[TMPVAR1]], i32 7
 ; CHECK-NEXT:    store <8 x i8> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <8 x i8> [[TMP9]]
+; CHECK-NEXT:    ret <8 x i8> [[TMPVAR9]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i8, ptr %bar
   %tmpvar2 = insertelement <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, i8 %tmpvar1, i32 0
   %tmpvar3 = insertelement <8 x i8> %tmpvar2, i8 %tmpvar1, i32 1
@@ -1493,6 +1310,7 @@ define <8 x i8> @ld1r_8b(ptr %bar) #0 {
 }
 
 define <16 x i8> @ld1r_16b(ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <16 x i8> @ld1r_16b(
 ; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1539,11 +1357,10 @@ define <16 x i8> @ld1r_16b(ptr %bar) #0 {
 ; CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 [[_MSLD]], i32 14
 ; CHECK-NEXT:    [[TMPVAR16:%.*]] = insertelement <16 x i8> [[TMPVAR15]], i8 [[TMPVAR1]], i32 14
 ; CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 [[_MSLD]], i32 15
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i8> [[TMPVAR16]], i8 [[TMPVAR1]], i32 15
+; CHECK-NEXT:    [[TMPVAR17:%.*]] = insertelement <16 x i8> [[TMPVAR16]], i8 [[TMPVAR1]], i32 15
 ; CHECK-NEXT:    store <16 x i8> [[_MSPROP15]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <16 x i8> [[TMP17]]
+; CHECK-NEXT:    ret <16 x i8> [[TMPVAR17]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i8, ptr %bar
   %tmpvar2 = insertelement <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, i8 %tmpvar1, i32 0
   %tmpvar3 = insertelement <16 x i8> %tmpvar2, i8 %tmpvar1, i32 1
@@ -1565,6 +1382,7 @@ define <16 x i8> @ld1r_16b(ptr %bar) #0 {
 }
 
 define <4 x i16> @ld1r_4h(ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <4 x i16> @ld1r_4h(
 ; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1591,7 +1409,6 @@ define <4 x i16> @ld1r_4h(ptr %bar) #0 {
 ; CHECK-NEXT:    store <4 x i16> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i16> [[TMPVAR5]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i16, ptr %bar
   %tmpvar2 = insertelement <4 x i16> <i16 poison, i16 poison, i16 poison, i16 poison>, i16 %tmpvar1, i32 0
   %tmpvar3 = insertelement <4 x i16> %tmpvar2, i16 %tmpvar1, i32 1
@@ -1601,6 +1418,7 @@ define <4 x i16> @ld1r_4h(ptr %bar) #0 {
 }
 
 define <8 x i16> @ld1r_8h(ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <8 x i16> @ld1r_8h(
 ; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1631,11 +1449,10 @@ define <8 x i16> @ld1r_8h(ptr %bar) #0 {
 ; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[_MSLD]], i32 6
 ; CHECK-NEXT:    [[TMPVAR8:%.*]] = insertelement <8 x i16> [[TMPVAR7]], i16 [[TMPVAR1]], i32 6
 ; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[_MSLD]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i16> [[TMPVAR8]], i16 [[TMPVAR1]], i32 7
+; CHECK-NEXT:    [[TMPVAR9:%.*]] = insertelement <8 x i16> [[TMPVAR8]], i16 [[TMPVAR1]], i32 7
 ; CHECK-NEXT:    store <8 x i16> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <8 x i16> [[TMP9]]
+; CHECK-NEXT:    ret <8 x i16> [[TMPVAR9]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i16, ptr %bar
   %tmpvar2 = insertelement <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>, i16 %tmpvar1, i32 0
   %tmpvar3 = insertelement <8 x i16> %tmpvar2, i16 %tmpvar1, i32 1
@@ -1649,12 +1466,13 @@ define <8 x i16> @ld1r_8h(ptr %bar) #0 {
 }
 
 define <2 x i32> @ld1r_2s(ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <2 x i32> @ld1r_2s(
 ; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
 ; CHECK:       2:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
@@ -1667,11 +1485,10 @@ define <2 x i32> @ld1r_2s(ptr %bar) #0 {
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
 ; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i32> poison, i32 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMPVAR2]], i32 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <2 x i32> [[TMPVAR2]], i32 [[TMPVAR1]], i32 1
 ; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
+; CHECK-NEXT:    ret <2 x i32> [[TMPVAR3]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i32, ptr %bar
   %tmpvar2 = insertelement <2 x i32> <i32 poison, i32 poison>, i32 %tmpvar1, i32 0
   %tmpvar3 = insertelement <2 x i32> %tmpvar2, i32 %tmpvar1, i32 1
@@ -1679,6 +1496,7 @@ define <2 x i32> @ld1r_2s(ptr %bar) #0 {
 }
 
 define <4 x i32> @ld1r_4s(ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <4 x i32> @ld1r_4s(
 ; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1705,7 +1523,6 @@ define <4 x i32> @ld1r_4s(ptr %bar) #0 {
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMPVAR5]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i32, ptr %bar
   %tmpvar2 = insertelement <4 x i32> <i32 poison, i32 poison, i32 poison, i32 poison>, i32 %tmpvar1, i32 0
   %tmpvar3 = insertelement <4 x i32> %tmpvar2, i32 %tmpvar1, i32 1
@@ -1715,12 +1532,13 @@ define <4 x i32> @ld1r_4s(ptr %bar) #0 {
 }
 
 define <2 x i64> @ld1r_2d(ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <2 x i64> @ld1r_2d(
 ; CHECK-SAME: ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
 ; CHECK:       2:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
@@ -1733,11 +1551,10 @@ define <2 x i64> @ld1r_2d(ptr %bar) #0 {
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
 ; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x i64> poison, i64 [[TMPVAR1]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMPVAR2]], i64 [[TMPVAR1]], i32 1
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <2 x i64> [[TMPVAR2]], i64 [[TMPVAR1]], i32 1
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+; CHECK-NEXT:    ret <2 x i64> [[TMPVAR3]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i64, ptr %bar
   %tmpvar2 = insertelement <2 x i64> <i64 poison, i64 poison>, i64 %tmpvar1, i32 0
   %tmpvar3 = insertelement <2 x i64> %tmpvar2, i64 %tmpvar1, i32 1
@@ -1745,6 +1562,7 @@ define <2 x i64> @ld1r_2d(ptr %bar) #0 {
 }
 
 define %struct.__neon_int8x8x2_t @ld2r_8b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x8x2_t @ld2r_8b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1763,12 +1581,12 @@ define %struct.__neon_int8x8x2_t @ld2r_8b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0(ptr %A)
   ret %struct.__neon_int8x8x2_t  %tmpvar2
 }
 
 define %struct.__neon_int8x8x3_t @ld3r_8b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x8x3_t @ld3r_8b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1789,12 +1607,12 @@ define %struct.__neon_int8x8x3_t @ld3r_8b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0(ptr %A)
   ret %struct.__neon_int8x8x3_t  %tmpvar2
 }
 
 define %struct.__neon_int8x8x4_t @ld4r_8b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x8x4_t @ld4r_8b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1817,7 +1635,6 @@ define %struct.__neon_int8x8x4_t @ld4r_8b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X8X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0(ptr %A)
   ret %struct.__neon_int8x8x4_t  %tmpvar2
 }
@@ -1827,6 +1644,7 @@ declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0(ptr) nounwind
 declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0(ptr) nounwind readonly
 
 define %struct.__neon_int8x16x2_t @ld2r_16b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2r_16b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1845,12 +1663,12 @@ define %struct.__neon_int8x16x2_t @ld2r_16b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0(ptr %A)
   ret %struct.__neon_int8x16x2_t  %tmpvar2
 }
 
 define %struct.__neon_int8x16x3_t @ld3r_16b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3r_16b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1871,12 +1689,12 @@ define %struct.__neon_int8x16x3_t @ld3r_16b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0(ptr %A)
   ret %struct.__neon_int8x16x3_t  %tmpvar2
 }
 
 define %struct.__neon_int8x16x4_t @ld4r_16b(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4r_16b(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1899,7 +1717,6 @@ define %struct.__neon_int8x16x4_t @ld4r_16b(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT8X16X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0(ptr %A)
   ret %struct.__neon_int8x16x4_t  %tmpvar2
 }
@@ -1909,6 +1726,7 @@ declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0(ptr) nounwin
 declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0(ptr) nounwind readonly
 
 define %struct.__neon_int16x4x2_t @ld2r_4h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x4x2_t @ld2r_4h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1927,12 +1745,12 @@ define %struct.__neon_int16x4x2_t @ld2r_4h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0(ptr %A)
   ret %struct.__neon_int16x4x2_t  %tmpvar2
 }
 
 define %struct.__neon_int16x4x3_t @ld3r_4h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x4x3_t @ld3r_4h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1953,12 +1771,12 @@ define %struct.__neon_int16x4x3_t @ld3r_4h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0(ptr %A)
   ret %struct.__neon_int16x4x3_t  %tmpvar2
 }
 
 define %struct.__neon_int16x4x4_t @ld4r_4h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x4x4_t @ld4r_4h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -1981,7 +1799,6 @@ define %struct.__neon_int16x4x4_t @ld4r_4h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X4X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0(ptr %A)
   ret %struct.__neon_int16x4x4_t  %tmpvar2
 }
@@ -1991,6 +1808,7 @@ declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0(ptr) nounwin
 declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0(ptr) nounwind readonly
 
 define %struct.__neon_int16x8x2_t @ld2r_8h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2r_8h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2009,12 +1827,12 @@ define %struct.__neon_int16x8x2_t @ld2r_8h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0(ptr %A)
   ret %struct.__neon_int16x8x2_t  %tmpvar2
 }
 
 define %struct.__neon_int16x8x3_t @ld3r_8h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3r_8h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2035,12 +1853,12 @@ define %struct.__neon_int16x8x3_t @ld3r_8h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0(ptr %A)
   ret %struct.__neon_int16x8x3_t  %tmpvar2
 }
 
 define %struct.__neon_int16x8x4_t @ld4r_8h(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4r_8h(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2063,7 +1881,6 @@ define %struct.__neon_int16x8x4_t @ld4r_8h(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT16X8X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0(ptr %A)
   ret %struct.__neon_int16x8x4_t  %tmpvar2
 }
@@ -2073,6 +1890,7 @@ declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0(ptr) nounwin
 declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0(ptr) nounwind readonly
 
 define %struct.__neon_int32x2x2_t @ld2r_2s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x2x2_t @ld2r_2s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2091,12 +1909,12 @@ define %struct.__neon_int32x2x2_t @ld2r_2s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0(ptr %A)
   ret %struct.__neon_int32x2x2_t  %tmpvar2
 }
 
 define %struct.__neon_int32x2x3_t @ld3r_2s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x2x3_t @ld3r_2s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2117,12 +1935,12 @@ define %struct.__neon_int32x2x3_t @ld3r_2s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0(ptr %A)
   ret %struct.__neon_int32x2x3_t  %tmpvar2
 }
 
 define %struct.__neon_int32x2x4_t @ld4r_2s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x2x4_t @ld4r_2s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2145,7 +1963,6 @@ define %struct.__neon_int32x2x4_t @ld4r_2s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X2X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0(ptr %A)
   ret %struct.__neon_int32x2x4_t  %tmpvar2
 }
@@ -2155,6 +1972,7 @@ declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0(ptr) nounwin
 declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0(ptr) nounwind readonly
 
 define %struct.__neon_int32x4x2_t @ld2r_4s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2r_4s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2173,12 +1991,12 @@ define %struct.__neon_int32x4x2_t @ld2r_4s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0(ptr %A)
   ret %struct.__neon_int32x4x2_t  %tmpvar2
 }
 
 define %struct.__neon_int32x4x3_t @ld3r_4s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3r_4s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2199,12 +2017,12 @@ define %struct.__neon_int32x4x3_t @ld3r_4s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0(ptr %A)
   ret %struct.__neon_int32x4x3_t  %tmpvar2
 }
 
 define %struct.__neon_int32x4x4_t @ld4r_4s(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4r_4s(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2227,7 +2045,6 @@ define %struct.__neon_int32x4x4_t @ld4r_4s(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT32X4X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0(ptr %A)
   ret %struct.__neon_int32x4x4_t  %tmpvar2
 }
@@ -2237,6 +2054,7 @@ declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0(ptr) nounwin
 declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0(ptr) nounwind readonly
 
 define %struct.__neon_int64x1x2_t @ld2r_1d(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x1x2_t @ld2r_1d(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2255,12 +2073,12 @@ define %struct.__neon_int64x1x2_t @ld2r_1d(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0(ptr %A)
   ret %struct.__neon_int64x1x2_t  %tmpvar2
 }
 
 define %struct.__neon_int64x1x3_t @ld3r_1d(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x1x3_t @ld3r_1d(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2281,12 +2099,12 @@ define %struct.__neon_int64x1x3_t @ld3r_1d(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0(ptr %A)
   ret %struct.__neon_int64x1x3_t  %tmpvar2
 }
 
 define %struct.__neon_int64x1x4_t @ld4r_1d(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x1x4_t @ld4r_1d(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2309,7 +2127,6 @@ define %struct.__neon_int64x1x4_t @ld4r_1d(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X1X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %A)
   ret %struct.__neon_int64x1x4_t  %tmpvar2
 }
@@ -2319,6 +2136,7 @@ declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0(ptr) nounwin
 declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0(ptr) nounwind readonly
 
 define %struct.__neon_int64x2x2_t @ld2r_2d(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2r_2d(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2337,12 +2155,12 @@ define %struct.__neon_int64x2x2_t @ld2r_2d(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X2_T]] [[TMP8]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %A)
   ret %struct.__neon_int64x2x2_t  %tmpvar2
 }
 
 define %struct.__neon_int64x2x3_t @ld3r_2d(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x2x3_t @ld3r_2d(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2363,12 +2181,12 @@ define %struct.__neon_int64x2x3_t @ld3r_2d(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X3_T]] [[TMP10]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %A)
   ret %struct.__neon_int64x2x3_t  %tmpvar2
 }
 
 define %struct.__neon_int64x2x4_t @ld4r_2d(ptr %A) nounwind #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4r_2d(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2391,7 +2209,6 @@ define %struct.__neon_int64x2x4_t @ld4r_2d(ptr %A) nounwind #0 {
 ; CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret [[STRUCT___NEON_INT64X2X4_T]] [[TMP12]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %A)
   ret %struct.__neon_int64x2x4_t  %tmpvar2
 }
@@ -2401,16 +2218,6 @@ declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0(ptr) nounwin
 declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0(ptr) nounwind readonly
 
 define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) #0 {
-; CHECK-SD-LABEL: ld1_16b:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ld1.b { v0 }[0], [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld1_16b:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr b1, [x0]
-; CHECK-GI-NEXT:    mov.b v0[0], v1[0]
-; CHECK-GI-NEXT:    ret
 ; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <16 x i8> @ld1_16b(
 ; CHECK-SAME: <16 x i8> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
@@ -2439,6 +2246,7 @@ define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) #0 {
 }
 
 define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <8 x i16> @ld1_8h(
 ; CHECK-SAME: <8 x i16> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
@@ -2460,13 +2268,13 @@ define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) #0 {
 ; CHECK-NEXT:    store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i16> [[TMPVAR2]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i16, ptr %bar
   %tmpvar2 = insertelement <8 x i16> %V, i16 %tmpvar1, i32 0
   ret <8 x i16> %tmpvar2
 }
 
 define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <4 x i32> @ld1_4s(
 ; CHECK-SAME: <4 x i32> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
@@ -2488,13 +2296,13 @@ define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) #0 {
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMPVAR2]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i32, ptr %bar
   %tmpvar2 = insertelement <4 x i32> %V, i32 %tmpvar1, i32 0
   ret <4 x i32> %tmpvar2
 }
 
 define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <4 x float> @ld1_4s_float(
 ; CHECK-SAME: <4 x float> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
@@ -2516,13 +2324,13 @@ define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) #0 {
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[TMPVAR2]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load float, ptr %bar
   %tmpvar2 = insertelement <4 x float> %V, float %tmpvar1, i32 0
   ret <4 x float> %tmpvar2
 }
 
 define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <2 x i64> @ld1_2d(
 ; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
@@ -2544,13 +2352,13 @@ define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) #0 {
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[TMPVAR2]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i64, ptr %bar
   %tmpvar2 = insertelement <2 x i64> %V, i64 %tmpvar1, i32 0
   ret <2 x i64> %tmpvar2
 }
 
 define <2 x double> @ld1_2d_double(<2 x double> %V, ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <2 x double> @ld1_2d_double(
 ; CHECK-SAME: <2 x double> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
@@ -2572,13 +2380,13 @@ define <2 x double> @ld1_2d_double(<2 x double> %V, ptr %bar) #0 {
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x double> [[TMPVAR2]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load double, ptr %bar
   %tmpvar2 = insertelement <2 x double> %V, double %tmpvar1, i32 0
   ret <2 x double> %tmpvar2
 }
 
 define <1 x i64> @ld1_1d(ptr %p) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <1 x i64> @ld1_1d(
 ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
@@ -2589,34 +2397,19 @@ define <1 x i64> @ld1_1d(ptr %p) #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       3:
-; CHECK-NEXT:    [[TMP:%.*]] = load <1 x i64>, ptr [[P]], align 8
+; CHECK-NEXT:    [[TMPVAR:%.*]] = load <1 x i64>, ptr [[P]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    store <1 x i64> [[_MSLD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <1 x i64> [[TMP]]
+; CHECK-NEXT:    ret <1 x i64> [[TMPVAR]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar = load <1 x i64>, ptr %p, align 8
   ret <1 x i64> %tmpvar
 }
 
 define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) #0 {
-; CHECK-SD-LABEL: ld1_8b:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    ld1.b { v0 }[0], [x0]
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld1_8b:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr b1, [x0]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov.b v0[0], v1[0]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
 ; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <8 x i8> @ld1_8b(
 ; CHECK-SAME: <8 x i8> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
@@ -2645,6 +2438,7 @@ define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) #0 {
 }
 
 define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <4 x i16> @ld1_4h(
 ; CHECK-SAME: <4 x i16> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
@@ -2666,13 +2460,13 @@ define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) #0 {
 ; CHECK-NEXT:    store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i16> [[TMPVAR2]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i16, ptr %bar
   %tmpvar2 = insertelement <4 x i16> %V, i16 %tmpvar1, i32 0
   ret <4 x i16> %tmpvar2
 }
 
 define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <2 x i32> @ld1_2s(
 ; CHECK-SAME: <2 x i32> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
@@ -2694,13 +2488,13 @@ define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) #0 {
 ; CHECK-NEXT:    store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i32> [[TMPVAR2]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load i32, ptr %bar
   %tmpvar2 = insertelement <2 x i32> %V, i32 %tmpvar1, i32 0
   ret <2 x i32> %tmpvar2
 }
 
 define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) #0 {
+; Make sure we are using the operands defined by the ABI
 ; CHECK-LABEL: define <2 x float> @ld1_2s_float(
 ; CHECK-SAME: <2 x float> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
@@ -2722,7 +2516,6 @@ define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) #0 {
 ; CHECK-NEXT:    store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x float> [[TMPVAR2]]
 ;
-; Make sure we are using the operands defined by the ABI
   %tmpvar1 = load float, ptr %bar
   %tmpvar2 = insertelement <2 x float> %V, float %tmpvar1, i32 0
   ret <2 x float> %tmpvar2
@@ -2731,21 +2524,6 @@ define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) #0 {
 
 ; Add rdar://13098923 test case: vld1_dup_u32 doesn't generate ld1r.2s
 define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture %diff) nounwind ssp #0 {
-; CHECK-SD-LABEL: ld1r_2s_from_dup:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldr s0, [x0]
-; CHECK-SD-NEXT:    ldr s1, [x1]
-; CHECK-SD-NEXT:    usubl.8h v0, v0, v1
-; CHECK-SD-NEXT:    str d0, [x2]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ld1r_2s_from_dup:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ld1r.2s { v0 }, [x0]
-; CHECK-GI-NEXT:    ld1r.2s { v1 }, [x1]
-; CHECK-GI-NEXT:    usubl.8h v0, v0, v1
-; CHECK-GI-NEXT:    str d0, [x2]
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define void @ld1r_2s_from_dup(
 ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], ptr captures(none) [[DIFF:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:  entry:
@@ -2754,7 +2532,7 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP22:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
 ; CHECK:       3:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
@@ -2769,9 +2547,9 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i32> [[_MSPROP]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMPVAR2]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[_MSPROP1]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 ; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP9]], label [[TMP23:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP9]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
 ; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
@@ -2788,7 +2566,7 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[_MSPROP4]] to <8 x i8>
 ; CHECK-NEXT:    [[TMPVAR7:%.*]] = bitcast <2 x i32> [[LANE1]] to <8 x i8>
 ; CHECK-NEXT:    [[_MSPROP5:%.*]] = zext <8 x i8> [[TMP8]] to <8 x i16>
-; CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <8 x i8> [[TMPVAR3]] to <8 x i16>
 ; CHECK-NEXT:    [[_MSPROP6:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i16>
 ; CHECK-NEXT:    [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[TMPVAR7]] to <8 x i16>
 ; CHECK-NEXT:    [[_MSPROP7:%.*]] = or <8 x i16> [[_MSPROP5]], [[_MSPROP6]]
@@ -2798,7 +2576,7 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ; CHECK-NEXT:    [[_MSPROP8:%.*]] = shufflevector <2 x i64> [[TMP15]], <2 x i64> splat (i64 -1), <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMPVAR8]], <2 x i64> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[_MSPROP8]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
+; CHECK-NEXT:    [[TMPVAR9:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
 ; CHECK-NEXT:    [[_MSCMP10:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP10]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
 ; CHECK:       17:
@@ -2809,7 +2587,7 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture
 ; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 193514046488576
 ; CHECK-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
 ; CHECK-NEXT:    store <4 x i16> [[TMP16]], ptr [[TMP21]], align 8
-; CHECK-NEXT:    store <4 x i16> [[TMP9]], ptr [[DIFF]], align 8
+; CHECK-NEXT:    store <4 x i16> [[TMPVAR9]], ptr [[DIFF]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2839,24 +2617,24 @@ define <4 x float> @ld1r_4s_float(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP1:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
 ; CHECK:       1:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       2:
-; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMPVAR:%.*]] = load float, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <4 x float> poison, float [[TMPVAR]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[TMP]], i32 1
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <4 x float> [[TMPVAR1]], float [[TMPVAR]], i32 1
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
-; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP]], i32 2
+; CHECK-NEXT:    [[TMPVAR3:%.*]] = insertelement <4 x float> [[TMPVAR2]], float [[TMPVAR]], i32 2
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
-; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <4 x float> [[TMPVAR3]], float [[TMP]], i32 3
+; CHECK-NEXT:    [[TMPVAR4:%.*]] = insertelement <4 x float> [[TMPVAR3]], float [[TMPVAR]], i32 3
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[TMPVAR4]]
 ;
@@ -2877,22 +2655,22 @@ define <2 x float> @ld1r_2s_float(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP1:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
 ; CHECK:       1:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       2:
-; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMPVAR:%.*]] = load float, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <2 x float> poison, float [[TMPVAR]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP]], i32 1
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x float> [[TMPVAR1]], float [[TMPVAR]], i32 1
 ; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <2 x float> [[TMP2]]
+; CHECK-NEXT:    ret <2 x float> [[TMPVAR2]]
 ;
 entry:
 ; Make sure we are using the operands defined by the ABI
@@ -2909,22 +2687,22 @@ define <2 x double> @ld1r_2d_double(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP1:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
 ; CHECK:       1:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       2:
-; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMPVAR:%.*]] = load double, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <2 x double> poison, double [[TMPVAR]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP]], i32 1
+; CHECK-NEXT:    [[TMPVAR2:%.*]] = insertelement <2 x double> [[TMPVAR1]], double [[TMPVAR]], i32 1
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMPVAR2]]
 ;
 entry:
 ; Make sure we are using the operands defined by the ABI
@@ -2941,20 +2719,20 @@ define <1 x double> @ld1r_1d_double(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP1:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
 ; CHECK:       1:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       2:
-; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMPVAR:%.*]] = load double, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <1 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <1 x double> poison, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <1 x double> poison, double [[TMPVAR]], i32 0
 ; CHECK-NEXT:    store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <1 x double> [[TMP1]]
+; CHECK-NEXT:    ret <1 x double> [[TMPVAR1]]
 ;
 entry:
 ; Make sure we are using the operands defined by the ABI
@@ -2970,20 +2748,20 @@ define <4 x float> @ld1r_4s_float_shuff(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP1:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
 ; CHECK:       1:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       2:
-; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMPVAR:%.*]] = load float, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <4 x float> poison, float [[TMPVAR]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[_MSPROP]], <4 x i32> splat (i32 -1), <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMPVAR1]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[LANE]]
 ;
@@ -3002,20 +2780,20 @@ define <2 x float> @ld1r_2s_float_shuff(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP1:%.*]], label [[TMP2:%.*]], !prof [[PROF1]]
 ; CHECK:       1:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       2:
-; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMPVAR:%.*]] = load float, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <2 x float> poison, float [[TMPVAR]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i32> [[_MSPROP]], <2 x i32> splat (i32 -1), <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMPVAR1]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x float> [[LANE]]
 ;
@@ -3039,13 +2817,13 @@ define <2 x double> @ld1r_2d_double_shuff(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       2:
-; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMPVAR:%.*]] = load double, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <2 x double> poison, double [[TMPVAR]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[_MSPROP]], <2 x i64> splat (i64 -1), <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMPVAR1]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
@@ -3071,13 +2849,13 @@ define <1 x double> @ld1r_1d_double_shuff(ptr nocapture %x) #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       2:
-; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMPVAR:%.*]] = load double, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load i64, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <1 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <1 x double> poison, double [[TMP]], i32 0
+; CHECK-NEXT:    [[TMPVAR1:%.*]] = insertelement <1 x double> poison, double [[TMPVAR]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <1 x i64> [[_MSPROP]], <1 x i64> splat (i64 -1), <1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMPVAR1]], <1 x double> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:    store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
@@ -4047,24 +3825,6 @@ define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) #0 {
 }
 
 define <8 x i8> @dup_ld1_from_stack(ptr %__ret) #0 {
-; CHECK-SD-LABEL: dup_ld1_from_stack:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sub sp, sp, #16
-; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT:    add x8, sp, #15
-; CHECK-SD-NEXT:    ld1r.8b { v0 }, [x8]
-; CHECK-SD-NEXT:    add sp, sp, #16
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: dup_ld1_from_stack:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GI-NEXT:    .cfi_offset w29, -16
-; CHECK-GI-NEXT:    add x8, sp, #15
-; CHECK-GI-NEXT:    ld1r.8b { v0 }, [x8]
-; CHECK-GI-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
 ; CHECK-LABEL: define <8 x i8> @dup_ld1_from_stack(
 ; CHECK-SAME: ptr [[__RET:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:



More information about the llvm-commits mailing list