[llvm] [msan] Add test for Arm NEON tbl intrinsics (PR #114462)

Thu Oct 31 15:57:54 PDT 2024

https://github.com/thurstond updated https://github.com/llvm/llvm-project/pull/114462

>From 4d47f0b42f5112e111c93395488b0afa7d5027e5 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Thu, 31 Oct 2024 20:33:16 +0000
Subject: [PATCH 1/4] [msan] Add test for Arm NEON tbl intrinsics

Arm NEON tbl intrinsics are currently not instrumented properly by MSan.
This test will help track progress.
---
 .../MemorySanitizer/AArch64/neon_tbl.ll       | 509 ++++++++++++++++++
 1 file changed, 509 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll
new file mode 100644
index 00000000000000..42f79fbf027483
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll
@@ -0,0 +1,509 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool build/bin/opt --version 2
+; Test memory sanitizer instrumentation for Arm NEON tbl instructions.
+;
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-tbl.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
+; CHECK-LABEL: define <8 x i8> @tbl1_8b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
+; CHECK-LABEL: define <16 x i8> @tbl1_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
+; CHECK-LABEL: define <8 x i8> @tbl2_8b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+; CHECK-LABEL: define <16 x i8> @tbl2_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK-LABEL: define <8 x i8> @tbl3_8b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK-LABEL: define <16 x i8> @tbl3_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK-LABEL: define <8 x i8> @tbl4_8b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK-LABEL: define <16 x i8> @tbl4_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+  ret <16 x i8> %out
+}
+
+
+
+define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i8> [[T1]], <8 x i8> [[T2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[S]]
+;
+  %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %c, <16 x i8> %d, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 -1, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 1, i32 0
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 1, i32 1
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 1, i32 2
+; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 1, i32 3
+; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 1, i32 4
+; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 1, i32 5
+; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 1, i32 6
+; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 1, i32 7
+; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 [[V]], i32 12
+; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 [[V]], i32 13
+; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 31>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 1, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 1, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 1, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 1, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 1, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 %v, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 %v, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 31>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 -1, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 [[V]], i32 14
+; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 30, i32 31>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 %v, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 30, i32 31>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
+; CHECK-LABEL: define <8 x i8> @tbx1_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
+; CHECK-LABEL: define <16 x i8> @tbx1_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK-LABEL: define <8 x i8> @tbx2_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK-LABEL: define <16 x i8> @tbx2_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK-LABEL: define <8 x i8> @tbx3_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK-LABEL: define <16 x i8> @tbx3_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
+; CHECK-LABEL: define <8 x i8> @tbx4_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <8 x i8> [[F:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <8 x i8> [[F]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
+; CHECK-LABEL: define <16 x i8> @tbx4_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <16 x i8> [[F:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <16 x i8> [[F]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
+  ret <16 x i8> %out
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

>From 1cb3f14fa11c217b3b6cddcb3a94d42a73d69daf Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Thu, 31 Oct 2024 22:34:42 +0000
Subject: [PATCH 2/4] Add origins-tracking test

---
 .../AArch64/neon_tbl_origins.ll               | 534 ++++++++++++++++++
 1 file changed, 534 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll
new file mode 100644
index 00000000000000..44c0a278de96f3
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll
@@ -0,0 +1,534 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool build/bin/opt --version 2
+; Test memory sanitizer instrumentation for Arm NEON tbl instructions.
+;
+; RUN: build/bin/opt < %s -passes=msan -msan-track-origins=2 -S | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-tbl.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+; -----------------------------------------------------------------------------------------------------------------------------------------------
+
+define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
+; CHECK-LABEL: define <8 x i8> @tbl1_8b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
+; CHECK-LABEL: define <16 x i8> @tbl1_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
+; CHECK-LABEL: define <8 x i8> @tbl2_8b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+; CHECK-LABEL: define <16 x i8> @tbl2_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK-LABEL: define <8 x i8> @tbl3_8b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK-LABEL: define <16 x i8> @tbl3_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK-LABEL: define <8 x i8> @tbl4_8b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK-LABEL: define <16 x i8> @tbl4_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+  ret <16 x i8> %out
+}
+
+
+
+define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i8> [[T1]], <8 x i8> [[T2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i8> [[S]]
+;
+  %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %c, <16 x i8> %d, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 -1, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 1, i32 0
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 1, i32 1
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 1, i32 2
+; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 1, i32 3
+; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 1, i32 4
+; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 1, i32 5
+; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 1, i32 6
+; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 1, i32 7
+; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 [[V]], i32 12
+; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 [[V]], i32 13
+; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 31>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 1, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 1, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 1, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 1, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 1, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 %v, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 %v, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 31>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 -1, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 [[V]], i32 14
+; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 30, i32 31>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 %v, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 30, i32 31>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[S]]
+;
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
+; CHECK-LABEL: define <8 x i8> @tbx1_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
+; CHECK-LABEL: define <16 x i8> @tbx1_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK-LABEL: define <8 x i8> @tbx2_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK-LABEL: define <16 x i8> @tbx2_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK-LABEL: define <8 x i8> @tbx3_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK-LABEL: define <16 x i8> @tbx3_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+  ret <16 x i8> %out
+}
+
+define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
+; CHECK-LABEL: define <8 x i8> @tbx4_8b
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <8 x i8> [[F:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <8 x i8> [[F]])
+; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i8> [[OUT]]
+;
+  %out = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
+  ret <8 x i8> %out
+}
+
+define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
+; CHECK-LABEL: define <16 x i8> @tbx4_16b
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <16 x i8> [[F:%.*]]) {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <16 x i8> [[F]])
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <16 x i8> [[OUT]]
+;
+  %out = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
+  ret <16 x i8> %out
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

>From 516d4aa80d9cc275906d08679c9ca238f5b3e938 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Thu, 31 Oct 2024 22:36:30 +0000
Subject: [PATCH 3/4] Fix tool name

---
 .../Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll
index 44c0a278de96f3..bb22277ef4b055 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool build/bin/opt --version 2
 ; Test memory sanitizer instrumentation for Arm NEON tbl instructions.
 ;
-; RUN: build/bin/opt < %s -passes=msan -msan-track-origins=2 -S | FileCheck %s
+; RUN: opt < %s -passes=msan -msan-track-origins=2 -S | FileCheck %s
 ;
 ; Forked from llvm/test/CodeGen/AArch64/arm64-tbl.ll
 

>From e7cd4721c66e607890dab98fb86299cb75752408 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Thu, 31 Oct 2024 22:57:17 +0000
Subject: [PATCH 4/4] Add 'sanitize_memory' annotations to tests

---
 .../MemorySanitizer/AArch64/neon_tbl.ll       | 524 ++++++++--
 .../AArch64/neon_tbl_origins.ll               | 978 ++++++++++++++++--
 2 files changed, 1330 insertions(+), 172 deletions(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll
index 42f79fbf027483..2505a3e95fe02e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll
@@ -10,10 +10,22 @@ target triple = "aarch64--linux-android9001"
 
 ; -----------------------------------------------------------------------------------------------------------------------------------------------
 
-define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
+define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbl1_8b
 ; CHECK-SAME: (<16 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i8> [[OUT]]
@@ -22,22 +34,41 @@ define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
+define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbl1_16b
 ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
+define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbl2_8b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i8> [[OUT]]
@@ -46,22 +77,47 @@ define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbl2_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbl3_8b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i8> [[OUT]]
@@ -70,22 +126,53 @@ define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbl3_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbl4_8b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i8> [[OUT]]
@@ -94,12 +181,21 @@ define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D,
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbl4_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP5]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
@@ -108,11 +204,35 @@ define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %
 
 
 
-define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    [[T1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    br i1 [[_MSOR4]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
 ; CHECK-NEXT:    [[T2:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i8> [[T1]], <8 x i8> [[T2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
@@ -126,14 +246,23 @@ define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -143,30 +272,56 @@ define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c
 }
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[TMP1]], i32 2
 ; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 3
 ; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[TMP1]], i32 4
 ; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[TMP1]], i32 5
 ; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[TMP1]], i32 6
 ; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[TMP1]], i32 7
 ; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 0, i32 8
 ; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 0, i32 9
 ; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 0, i32 10
 ; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 0, i32 11
 ; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 0, i32 12
 ; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 0, i32 13
 ; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 0, i32 14
 ; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 0, i32 15
 ; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15
+; CHECK-NEXT:    [[_MSPROP16:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], [[_MSPROP15]]
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[_MSPROP18:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
@@ -192,9 +347,14 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
 }
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 1, i32 0
 ; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 1, i32 1
@@ -208,14 +368,23 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
 ; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
 ; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
 ; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1>, i8 [[TMP1]], i32 12
 ; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 [[V]], i32 12
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 13
 ; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 [[V]], i32 13
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 0, i32 14
 ; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 15
 ; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <16 x i8> [[_MSPROP4]], [[_MSPROP3]]
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = or <16 x i8> [[_MSPROP6]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = shufflevector <16 x i8> [[_MSPROP5]], <16 x i8> [[_MSPROP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 31>
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 31>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0
@@ -242,30 +411,56 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[TMP1]], i32 2
 ; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 3
 ; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[TMP1]], i32 4
 ; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[TMP1]], i32 5
 ; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[TMP1]], i32 6
 ; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[TMP1]], i32 7
 ; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 0, i32 8
 ; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 0, i32 9
 ; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 0, i32 10
 ; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 0, i32 11
 ; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 0, i32 12
 ; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 0, i32 13
 ; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 0, i32 14
 ; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 0, i32 15
 ; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15
+; CHECK-NEXT:    [[_MSPROP16:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP18:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], [[_MSPROP15]]
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
@@ -292,30 +487,56 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[TMP1]], i32 2
 ; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 3
 ; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[TMP1]], i32 4
 ; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[TMP1]], i32 5
 ; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[TMP1]], i32 6
 ; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[TMP1]], i32 7
 ; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 0, i32 8
 ; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 0, i32 9
 ; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 0, i32 10
 ; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 0, i32 11
 ; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 0, i32 12
 ; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 0, i32 13
 ; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 [[TMP1]], i32 14
 ; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 [[V]], i32 14
+; CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 [[TMP1]], i32 15
 ; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15
+; CHECK-NEXT:    [[_MSPROP16:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP18:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], [[_MSPROP15]]
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 30, i32 31>
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 30, i32 31>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
@@ -342,14 +563,23 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -360,14 +590,23 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -378,14 +617,23 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8>
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -394,19 +642,35 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8>
   ret <16 x i8> %s
 }
 
-declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
 
-define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
+define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbx1_8b
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i8> [[OUT]]
@@ -415,22 +679,47 @@ define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
+define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbx1_16b
 ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbx2_8b
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i8> [[OUT]]
@@ -439,22 +728,53 @@ define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbx2_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbx3_8b
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) {
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i8> [[OUT]]
@@ -463,22 +783,59 @@ define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D,
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbx3_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP5]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
+define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbx4_8b
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <8 x i8> [[F:%.*]]) {
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <8 x i8> [[F:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to i64
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <8 x i8> [[F]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i8> [[OUT]]
@@ -487,23 +844,34 @@ define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D,
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
+define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbx4_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <16 x i8> [[F:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <16 x i8> [[F:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <16 x i8> [[_MSPROP3]], [[TMP6]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <16 x i8> [[F]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
   ret <16 x i8> %out
 }
 
-declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll
index bb22277ef4b055..034437a8d7beb7 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl_origins.ll
@@ -10,10 +10,28 @@ target triple = "aarch64--linux-android9001"
 
 ; -----------------------------------------------------------------------------------------------------------------------------------------------
 
-define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
+define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbl1_8b
 ; CHECK-SAME: (<16 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
@@ -23,23 +41,58 @@ define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
+define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbl1_16b
 ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], i32 [[TMP4]], i32 [[TMP2]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP7]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
+define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbl2_8b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       14:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       15:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
@@ -49,23 +102,73 @@ define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbl2_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP6]], i32 [[TMP9]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP12]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbl3_8b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i8> [[TMP7]] to i64
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       19:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       20:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
@@ -75,23 +178,88 @@ define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbl3_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP6]], i32 [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP8]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP17]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbl4_8b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP9]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP23]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK:       24:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       25:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
@@ -101,13 +269,39 @@ define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D,
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbl4_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP6]], i32 [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP8]], i32 [[TMP16]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP9]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP10]], i32 [[TMP19]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP22]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
@@ -116,11 +310,47 @@ define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %
 
 
 
-define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
 ; CHECK-NEXT:    [[T1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       19:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       20:
 ; CHECK-NEXT:    [[T2:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i8> [[T1]], <8 x i8> [[T2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
@@ -135,15 +365,37 @@ define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP8]], i32 [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[_MSPROP3]] to i128
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP17]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -153,31 +405,90 @@ define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c
 }
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP2]], i32 0
 ; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP2]], i32 [[TMP12]]
 ; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP2]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP2]], i32 [[TMP16]]
 ; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP2]], i32 [[TMP18]]
 ; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[TMP1]], i32 5
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP2]], i32 [[TMP20]]
 ; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[TMP1]], i32 6
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP2]], i32 [[TMP22]]
 ; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[TMP1]], i32 7
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP2]], i32 [[TMP24]]
 ; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 0, i32 8
 ; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 0, i32 9
 ; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 0, i32 10
 ; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 0, i32 11
 ; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 0, i32 12
 ; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 0, i32 13
 ; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 0, i32 14
 ; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 0, i32 15
 ; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15
+; CHECK-NEXT:    [[_MSPROP16:%.*]] = or <16 x i8> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne i128 [[TMP27]], 0
+; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP6]], i32 [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], [[_MSPROP15]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i8> [[_MSPROP15]] to i128
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne i128 [[TMP30]], 0
+; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP26]], i32 [[TMP29]]
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[_MSPROP18:%.*]] = or <16 x i8> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i128 [[TMP33]], 0
+; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP10]], i32 [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[_MSPROP19]] to i128
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne i128 [[TMP36]], 0
+; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP32]]
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP38]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
@@ -203,9 +514,19 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
 }
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 1, i32 0
 ; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 1, i32 1
@@ -219,15 +540,42 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
 ; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
 ; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
 ; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1>, i8 [[TMP1]], i32 12
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP2]], i32 0
 ; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 [[V]], i32 12
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 13
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP2]], i32 [[TMP12]]
 ; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 [[V]], i32 13
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 0, i32 14
 ; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 15
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP2]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <16 x i8> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP6]], i32 [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <16 x i8> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i8> [[_MSPROP3]] to i128
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP16]], i32 [[TMP19]]
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = or <16 x i8> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne i128 [[TMP23]], 0
+; CHECK-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP10]], i32 [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = or <16 x i8> [[_MSPROP6]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = shufflevector <16 x i8> [[_MSPROP5]], <16 x i8> [[_MSPROP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 31>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <16 x i8> [[_MSPROP7]] to i128
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp ne i128 [[TMP26]], 0
+; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP25]], i32 [[TMP22]]
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 31>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP28]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0
@@ -254,31 +602,90 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP2]], i32 0
 ; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP2]], i32 [[TMP12]]
 ; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP2]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP2]], i32 [[TMP16]]
 ; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP2]], i32 [[TMP18]]
 ; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[TMP1]], i32 5
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP2]], i32 [[TMP20]]
 ; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[TMP1]], i32 6
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP2]], i32 [[TMP22]]
 ; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[TMP1]], i32 7
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP2]], i32 [[TMP24]]
 ; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 0, i32 8
 ; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 0, i32 9
 ; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 0, i32 10
 ; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 0, i32 11
 ; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 0, i32 12
 ; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 0, i32 13
 ; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 0, i32 14
 ; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14
+; CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 0, i32 15
 ; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15
+; CHECK-NEXT:    [[_MSPROP16:%.*]] = or <16 x i8> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne i128 [[TMP27]], 0
+; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP6]], i32 [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP18:%.*]] = or <16 x i8> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne i128 [[TMP30]], 0
+; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP10]], i32 [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], [[_MSPROP15]]
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast <16 x i8> [[_MSPROP15]] to i128
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i128 [[TMP33]], 0
+; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP26]], i32 [[TMP32]]
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x i8> [[_MSPROP19]] to i128
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ne i128 [[TMP36]], 0
+; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP29]]
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP38]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
@@ -305,31 +712,94 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP2]], i32 0
 ; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP2]], i32 [[TMP12]]
 ; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP2]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP2]], i32 [[TMP16]]
 ; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP2]], i32 [[TMP18]]
 ; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[TMP1]], i32 5
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP2]], i32 [[TMP20]]
 ; CHECK-NEXT:    [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[TMP1]], i32 6
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP2]], i32 [[TMP22]]
 ; CHECK-NEXT:    [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6
+; CHECK-NEXT:    [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[TMP1]], i32 7
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP2]], i32 [[TMP24]]
 ; CHECK-NEXT:    [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7
+; CHECK-NEXT:    [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 0, i32 8
 ; CHECK-NEXT:    [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8
+; CHECK-NEXT:    [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 0, i32 9
 ; CHECK-NEXT:    [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9
+; CHECK-NEXT:    [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 0, i32 10
 ; CHECK-NEXT:    [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10
+; CHECK-NEXT:    [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 0, i32 11
 ; CHECK-NEXT:    [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11
+; CHECK-NEXT:    [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 0, i32 12
 ; CHECK-NEXT:    [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12
+; CHECK-NEXT:    [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 0, i32 13
 ; CHECK-NEXT:    [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13
+; CHECK-NEXT:    [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 [[TMP1]], i32 14
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP2]], i32 [[TMP26]]
 ; CHECK-NEXT:    [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 [[V]], i32 14
+; CHECK-NEXT:    [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 [[TMP1]], i32 15
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP2]], i32 [[TMP28]]
 ; CHECK-NEXT:    [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15
+; CHECK-NEXT:    [[_MSPROP16:%.*]] = or <16 x i8> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ne i128 [[TMP31]], 0
+; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP6]], i32 [[TMP4]]
+; CHECK-NEXT:    [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP18:%.*]] = or <16 x i8> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne i128 [[TMP34]], 0
+; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP10]], i32 [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], [[_MSPROP15]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i8> [[_MSPROP15]] to i128
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp ne i128 [[TMP37]], 0
+; CHECK-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP30]], i32 [[TMP36]]
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]])
+; CHECK-NEXT:    [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i8> [[_MSPROP19]] to i128
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne i128 [[TMP40]], 0
+; CHECK-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 [[TMP33]]
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 30, i32 31>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP42]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
@@ -356,15 +826,37 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP8]], i32 [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[_MSPROP3]] to i128
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP17]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -375,15 +867,37 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP8]], i32 [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[_MSPROP3]] to i128
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP17]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -394,15 +908,37 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8>
 
 
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer
 ; CHECK-NEXT:    [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP8]], i32 [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer
 ; CHECK-NEXT:    [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[_MSPROP3]] to i128
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP17]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -411,19 +947,46 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8>
   ret <16 x i8> %s
 }
 
-declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
 
-define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
+define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbx1_8b
 ; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       14:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       15:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
@@ -433,23 +996,73 @@ define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
+define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbx1_16b
 ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP6]], i32 [[TMP9]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP12]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbx2_8b
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 40) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       13:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK:       16:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       17:
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x i8> [[TMP7]] to i64
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       19:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       20:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
@@ -459,23 +1072,88 @@ define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbx2_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP6]], i32 [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP8]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP17]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbx3_8b
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) {
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 40) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 56) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       15:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
+; CHECK:       18:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       19:
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       21:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP9]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP23]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK:       24:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       25:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
@@ -485,23 +1163,103 @@ define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D,
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbx3_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP6]], i32 [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP8]], i32 [[TMP16]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP9]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP10]], i32 [[TMP19]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP22]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
   ret <16 x i8> %out
 }
 
-define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
+define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @tbx4_8b
-; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <8 x i8> [[F:%.*]]) {
+; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <8 x i8> [[F:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 40) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 56) to ptr), align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 72) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       14:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       15:
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       17:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       18:
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]]
+; CHECK:       20:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       21:
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP22]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK:       23:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       24:
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP25]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
+; CHECK:       26:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       27:
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP28]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP5]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
+; CHECK:       29:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP12]]) #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       30:
 ; CHECK-NEXT:    [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <8 x i8> [[F]])
 ; CHECK-NEXT:    store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
@@ -511,24 +1269,56 @@ define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D,
   ret <8 x i8> %out
 }
 
-define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
+define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @tbx4_16b
-; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <16 x i8> [[F:%.*]]) {
+; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <16 x i8> [[F:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 80) to ptr), align 4
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP4]], i32 [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP6]], i32 [[TMP15]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <16 x i8> [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP8]], i32 [[TMP18]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP9]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne i128 [[TMP22]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP10]], i32 [[TMP21]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <16 x i8> [[_MSPROP3]], [[TMP11]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <16 x i8> [[TMP11]] to i128
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp ne i128 [[TMP25]], 0
+; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP12]], i32 [[TMP24]]
 ; CHECK-NEXT:    [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <16 x i8> [[F]])
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP27]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT:    ret <16 x i8> [[OUT]]
 ;
   %out = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
   ret <16 x i8> %out
 }
 
-declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone