[llvm-branch-commits] [llvm] ae6e893 - Precommit tests that have poison as shufflevector's placeholder

Juneyoung Lee via llvm-branch-commits <llvm-branch-commits at lists.llvm.org>
Tue Dec 29 00:14:36 PST 2020


Author: Juneyoung Lee
Date: 2020-12-29T17:09:31+09:00
New Revision: ae6e89327b04a94b6d1a2533c598ec6be60eb922

URL: https://github.com/llvm/llvm-project/commit/ae6e89327b04a94b6d1a2533c598ec6be60eb922
DIFF: https://github.com/llvm/llvm-project/commit/ae6e89327b04a94b6d1a2533c598ec6be60eb922.diff

LOG: Precommit tests that have poison as shufflevector's placeholder

This commit copies existing tests under llvm/test/Transforms that contain
'shufflevector X, undef' and replaces those occurrences with
'shufflevector X, poison'.
The newly copied tests carry a *-inseltpoison.ll suffix in their file names
(as db7a2f347f132b3920415013d62d1adfb18d8d58 did).
See https://reviews.llvm.org/D93793

Test files were listed using:

grep -R -E "^[^;]*shufflevector <.*> .*, <.*> undef" | cut -d":" -f1 | uniq
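For reference, a hedged sketch of what this pattern does and does not match;
the grep is presumably run from llvm/test/Transforms so the printed paths match
the snippet's $1 argument below, and the IR lines here are illustrative
examples, not taken from a specific test:

# Matches: undef is a shufflevector operand on a non-comment line.
#   %s = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
# Does not match: the leading "^[^;]*" excludes FileCheck comment lines.
#   ; CHECK: shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer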

Test files were copied and updated using the following snippet, where $1 is the
test file path relative to llvm/test/Transforms:

file_org=llvm/test/Transforms/$1
if [[ "$file_org" = *-inseltpoison.ll ]]; then
  file=$file_org
else
  file=${file_org%.ll}-inseltpoison.ll
  if [ ! -f $file ]; then
    cp $file_org $file
  fi
fi
sed -i -E 's/^([^;]*)shufflevector <(.*)> (.*), <(.*)> undef/\1shufflevector <\2> \3, <\4> poison/g' $file
head -1 $file | grep "Assertions have been autogenerated by utils/update_test_checks.py" -q
if [ "$?" == 1 ]; then
  echo "$file : should be manually updated"
  # This test's CHECK lines were written by hand, so it must be updated manually.
  exit 1
fi
python3 ./llvm/utils/update_test_checks.py --opt-binary=./build-releaseassert/bin/opt $file
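
For context, a hedged sketch of how the snippet above could be driven over the
files found by the grep step; the wrapper name (copy-inseltpoison.sh), the
loop, and the working directories are assumptions, not part of this commit:

# Assumes the snippet above is saved as copy-inseltpoison.sh in the
# llvm-project root and takes the path relative to llvm/test/Transforms as $1.
cd llvm/test/Transforms
grep -R -E "^[^;]*shufflevector <.*> .*, <.*> undef" | cut -d":" -f1 | uniq |
while read -r f; do
  # Run each file in a subshell from the repository root, so an 'exit 1' for a
  # manually-written test skips that file without stopping the loop.
  (cd ../../.. && bash copy-inseltpoison.sh "$f")
done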

Added: 
    llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions-inseltpoison.ll
    llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions-inseltpoison.ll
    llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash-inseltpoison.ll
    llvm/test/Transforms/DeadStoreElimination/masked-dead-store-inseltpoison.ll
    llvm/test/Transforms/Inline/inlined-loop-metadata-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/shufflemask-undef-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-avx2-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-muldq-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-pshufb-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-sse4a-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-vpermil-inseltpoison.ll
    llvm/test/Transforms/InstCombine/assume-inseltpoison.ll
    llvm/test/Transforms/InstCombine/bswap-inseltpoison.ll
    llvm/test/Transforms/InstCombine/fmul-inseltpoison.ll
    llvm/test/Transforms/InstCombine/icmp-vec-inseltpoison.ll
    llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll
    llvm/test/Transforms/InstCombine/mul-inseltpoison.ll
    llvm/test/Transforms/InstCombine/nsw-inseltpoison.ll
    llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll
    llvm/test/Transforms/InstCombine/pr2645-0-inseltpoison.ll
    llvm/test/Transforms/InstCombine/shuffle-cast-inseltpoison.ll
    llvm/test/Transforms/InstCombine/shuffle-select-narrow-inseltpoison.ll
    llvm/test/Transforms/InstCombine/shuffle_select-inseltpoison.ll
    llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
    llvm/test/Transforms/InstCombine/shufflevec-constant-inseltpoison.ll
    llvm/test/Transforms/InstCombine/sub-of-negatible-inseltpoison.ll
    llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll
    llvm/test/Transforms/InstCombine/type_pun-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vec-binop-select-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vector-concat-binop-inseltpoison.ll
    llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts-inseltpoison.ll
    llvm/test/Transforms/InstSimplify/shufflevector-inseltpoison.ll
    llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles-inseltpoison.ll
    llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-extract-user-inseltpoison.ll
    llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-inseltpoison.ll
    llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses-extract-user-inseltpoison.ll
    llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses-inseltpoison.ll
    llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx-inseltpoison.ll
    llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad-inseltpoison.ll
    llvm/test/Transforms/InterleavedAccess/X86/interleavedStore-inseltpoison.ll
    llvm/test/Transforms/LoopSimplify/do-preheader-dbg-inseltpoison.ll
    llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void-inseltpoison.ll
    llvm/test/Transforms/LoopUnroll/X86/pr46430-inseltpoison.ll
    llvm/test/Transforms/PhaseOrdering/X86/shuffle-inseltpoison.ll
    llvm/test/Transforms/Scalarizer/phi-bug-inseltpoison.ll
    llvm/test/Transforms/VectorCombine/AArch64/vscale-bitcast-shuffle-inseltpoison.ll
    llvm/test/Transforms/VectorCombine/X86/no-sse-inseltpoison.ll
    llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll

Modified: 
    llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll
    llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector-inseltpoison.ll
    llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll
    llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
    llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll
    llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll
    llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-pack-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-vector-shifts-inseltpoison.ll
    llvm/test/Transforms/InstCombine/extractelement-inseltpoison.ll
    llvm/test/Transforms/InstCombine/icmp-bc-vec-inseltpoison.ll
    llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll
    llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll
    llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
    llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
    llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll
    llvm/test/Transforms/InstCombine/shufflevector-div-rem-inseltpoison.ll
    llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vector_gep1-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vscale_extractelement-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vscale_insertelement-inseltpoison.ll
    llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
    llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll
    llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll
    llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors-inseltpoison.ll
    llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
    llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
    llvm/test/Transforms/Scalarizer/dbgloc-bug-inseltpoison.ll
    llvm/test/Transforms/Scalarizer/order-bug-inseltpoison.ll
    llvm/test/Transforms/SpeculativeExecution/spec-other-inseltpoison.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll
index 5611ac7068ba..d34081674966 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll
@@ -14,7 +14,7 @@ define <vscale x 4 x i32> @splat_base(i32* %base, <vscale x 4 x i64> %index, <vs
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
 ;
   %broadcast.splatinsert = insertelement <vscale x 4 x i32*> poison, i32* %base, i32 0
-  %broadcast.splat = shufflevector <vscale x 4 x i32*> %broadcast.splatinsert, <vscale x 4 x i32*> undef, <vscale x 4 x i32> zeroinitializer
+  %broadcast.splat = shufflevector <vscale x 4 x i32*> %broadcast.splatinsert, <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
   %gep = getelementptr i32, <vscale x 4 x i32*> %broadcast.splat, <vscale x 4 x i64> %index
   %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %gep, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
   ret <vscale x 4 x i32> %res
@@ -40,7 +40,7 @@ define <vscale x 4 x i32> @scalar_index(i32* %base, i64 %index, <vscale x 4 x i1
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
 ;
   %broadcast.splatinsert = insertelement <vscale x 4 x i32*> poison, i32* %base, i32 0
-  %broadcast.splat = shufflevector <vscale x 4 x i32*> %broadcast.splatinsert, <vscale x 4 x i32*> undef, <vscale x 4 x i32> zeroinitializer
+  %broadcast.splat = shufflevector <vscale x 4 x i32*> %broadcast.splatinsert, <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
   %gep = getelementptr i32, <vscale x 4 x i32*> %broadcast.splat, i64 %index
   %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %gep, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
   ret <vscale x 4 x i32> %res
@@ -54,7 +54,7 @@ define <vscale x 4 x i32> @splat_index(i32* %base, i64 %index, <vscale x 4 x i1>
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
 ;
   %broadcast.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index, i32 0
-  %broadcast.splat = shufflevector <vscale x 4 x i64> %broadcast.splatinsert, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
+  %broadcast.splat = shufflevector <vscale x 4 x i64> %broadcast.splatinsert, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
   %gep = getelementptr i32, i32* %base, <vscale x 4 x i64> %broadcast.splat
   %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %gep, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
   ret <vscale x 4 x i32> %res
@@ -77,7 +77,7 @@ define <vscale x 4 x i32> @global_struct_splat(<vscale x 4 x i1> %mask) #0 {
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = insertelement <vscale x 4 x %struct.a*> poison, %struct.a* @c, i32 0
-  %2 = shufflevector <vscale x 4 x %struct.a*> %1, <vscale x 4 x %struct.a*> undef, <vscale x 4 x i32> zeroinitializer
+  %2 = shufflevector <vscale x 4 x %struct.a*> %1, <vscale x 4 x %struct.a*> poison, <vscale x 4 x i32> zeroinitializer
   %3 = getelementptr %struct.a, <vscale x 4 x %struct.a*> %2, <vscale x 4 x i64> zeroinitializer, i32 1
   %4 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %3, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
   ret <vscale x 4 x i32> %4
@@ -90,7 +90,7 @@ define <vscale x 4 x i32> @splat_ptr_gather(i32* %ptr, <vscale x 4 x i1> %mask,
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
 ;
   %1 = insertelement <vscale x 4 x i32*> poison, i32* %ptr, i32 0
-  %2 = shufflevector <vscale x 4 x i32*> %1, <vscale x 4 x i32*> undef, <vscale x 4 x i32> zeroinitializer
+  %2 = shufflevector <vscale x 4 x i32*> %1, <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
   %3 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %2, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru)
   ret <vscale x 4 x i32> %3
 }
@@ -102,7 +102,7 @@ define void @splat_ptr_scatter(i32* %ptr, <vscale x 4 x i1> %mask, <vscale x 4 x
 ; CHECK-NEXT:    ret void
 ;
   %1 = insertelement <vscale x 4 x i32*> poison, i32* %ptr, i32 0
-  %2 = shufflevector <vscale x 4 x i32*> %1, <vscale x 4 x i32*> undef, <vscale x 4 x i32> zeroinitializer
+  %2 = shufflevector <vscale x 4 x i32*> %1, <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
   call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %val, <vscale x 4 x i32*> %2, i32 4, <vscale x 4 x i1> %mask)
   ret void
 }

diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions-inseltpoison.ll
new file mode 100644
index 000000000000..863cbc0e666c
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions-inseltpoison.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -codegenprepare -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown"
+
+define <8 x i16> @sink_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @sink_zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+; CHECK-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_2]]
+;
+entry:
+  %za = zext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = zext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = zext <8 x i8> %b to <8 x i16>
+  %res.2 = sub <8 x i16> %za, %zb.2
+  ret <8 x i16> %res.2
+}
+
+define <8 x i16> @sink_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @sink_sext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+; CHECK-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  %res.2 = sub <8 x i16> %za, %zb.2
+  ret <8 x i16> %res.2
+}
+
+define <8 x i16> @do_not_sink_nonfree_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @do_not_sink_nonfree_zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[ZB_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %zb.2
+}
+
+define <8 x i16> @do_not_sink_nonfree_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @do_not_sink_nonfree_sext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[ZB_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %zb.2
+}
+
+; The masks used are suitable for umull, sink shufflevector to users.
+define <8 x i16> @sink_shufflevector_umull(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @sink_shufflevector_umull(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL0]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL1]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  ret <8 x i16> %vmull0
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  ret <8 x i16> %vmull1
+}
+
+; Both exts and their shufflevector operands can be sunk.
+define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @sink_shufflevector_ext_subadd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <8 x i8> [[S1]] to <8 x i16>
+; CHECK-NEXT:    [[RES1:%.*]] = add <8 x i16> [[TMP0]], [[Z2]]
+; CHECK-NEXT:    ret <8 x i16> [[RES1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i8> [[S3]] to <8 x i16>
+; CHECK-NEXT:    [[RES2:%.*]] = sub <8 x i16> [[TMP1]], [[Z4]]
+; CHECK-NEXT:    ret <8 x i16> [[RES2]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z1 = zext <8 x i8> %s1 to <8 x i16>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z3 = sext <8 x i8> %s3 to <8 x i16>
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z2 = zext <8 x i8> %s2 to <8 x i16>
+  %res1 = add <8 x i16> %z1, %z2
+  ret <8 x i16> %res1
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z4 = sext <8 x i8> %s4 to <8 x i16>
+  %res2 = sub <8 x i16> %z3, %z4
+  ret <8 x i16> %res2
+}
+
+
+declare void @user1(<8 x i16>)
+
+; Both exts and their shufflevector operands can be sunk.
+define <8 x i16> @sink_shufflevector_ext_subadd_multiuse(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @sink_shufflevector_ext_subadd_multiuse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[Z3:%.*]] = sext <8 x i8> [[S3]] to <8 x i16>
+; CHECK-NEXT:    call void @user1(<8 x i16> [[Z3]])
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <8 x i8> [[S1]] to <8 x i16>
+; CHECK-NEXT:    [[RES1:%.*]] = add <8 x i16> [[TMP0]], [[Z2]]
+; CHECK-NEXT:    ret <8 x i16> [[RES1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i8> [[S3]] to <8 x i16>
+; CHECK-NEXT:    [[RES2:%.*]] = sub <8 x i16> [[TMP1]], [[Z4]]
+; CHECK-NEXT:    ret <8 x i16> [[RES2]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z1 = zext <8 x i8> %s1 to <8 x i16>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z3 = sext <8 x i8> %s3 to <8 x i16>
+  call void @user1(<8 x i16> %z3)
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z2 = zext <8 x i8> %s2 to <8 x i16>
+  %res1 = add <8 x i16> %z1, %z2
+  ret <8 x i16> %res1
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z4 = sext <8 x i8> %s4 to <8 x i16>
+  %res2 = sub <8 x i16> %z3, %z4
+  ret <8 x i16> %res2
+}
+
+
+; The masks used are not suitable for umull, do not sink.
+define <8 x i16> @no_sink_shufflevector_umull(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @no_sink_shufflevector_umull(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[S1]], <8 x i8> [[S2]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL0]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 10, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[S3]], <8 x i8> [[S4]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL1]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  ret <8 x i16> %vmull0
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 10, i32 12, i32 13, i32 14, i32 15>
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  ret <8 x i16> %vmull1
+}
+
+
+; Function Attrs: nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2

diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector-inseltpoison.ll
index f43b8f6d540d..9e925997b98f 100644
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector-inseltpoison.ll
@@ -4,10 +4,10 @@ define void @sink_add_mul(i32* %s1, i32 %x, i32* %d, i32 %n) {
 ; CHECK-LABEL: @sink_add_mul(
 ; CHECK:    vector.ph:
 ; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
-; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK:    vector.body:
 ; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ;
 entry:
   %cmp6 = icmp sgt i32 %n, 0
@@ -16,7 +16,7 @@ entry:
 vector.ph:                                        ; preds = %for.body.preheader
   %n.vec = and i32 %n, -4
   %broadcast.splatinsert8 = insertelement <4 x i32> poison, i32 %x, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> poison, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -43,13 +43,13 @@ define void @sink_add_mul_multiple(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2
 ; CHECK-LABEL: @sink_add_mul_multiple(
 ; CHECK:    vector.ph:
 ; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
-; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK:    vector.body:
 ; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %x, i32 0
-; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK:      mul nsw <4 x i32> %wide.load, [[TMP3]]
 ; CHECK:      [[TMP2b:%.*]] = insertelement <4 x i32> poison, i32 %x, i32 0
-; CHECK:      [[TMP3b:%.*]] = shufflevector <4 x i32> [[TMP2b]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:      [[TMP3b:%.*]] = shufflevector <4 x i32> [[TMP2b]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK:      mul nsw <4 x i32> %wide.load18, [[TMP3b]]
 ;
 entry:
@@ -59,7 +59,7 @@ entry:
 vector.ph:                                        ; preds = %for.body.preheader
   %n.vec = and i32 %n, -4
   %broadcast.splatinsert15 = insertelement <4 x i32> poison, i32 %x, i32 0
-  %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> poison, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -101,7 +101,7 @@ define void @sink_add_sub_unsinkable(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ;
 entry:
@@ -111,7 +111,7 @@ entry:
 vector.ph:                                        ; preds = %for.body.preheader
   %n.vec = and i32 %n, -4
   %broadcast.splatinsert15 = insertelement <4 x i32> poison, i32 %x, i32 0
-  %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> poison, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -148,10 +148,10 @@ define void @sink_sub(i32* %s1, i32 %x, i32* %d, i32 %n) {
 ; CHECK-LABEL: @sink_sub(
 ; CHECK:    vector.ph:
 ; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK:    vector.body:
 ; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ;
 entry:
   %cmp6 = icmp sgt i32 %n, 0
@@ -160,7 +160,7 @@ entry:
 vector.ph:                                        ; preds = %for.body.preheader
   %n.vec = and i32 %n, -4
   %broadcast.splatinsert8 = insertelement <4 x i32> poison, i32 %x, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> poison, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -186,11 +186,11 @@ entry:
 ; CHECK:      vector.ph:
 ; CHECK-NEXT:   [[N_VEC:%.*]] = and i32 [[N]], -4
 ; CHECK-NEXT:   [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; CHECK-NEXT:   [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:   [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:   br label [[VECTOR_BODY:%.*]]
 ; CHECK:      vector.body:
 ; CHECK-NOT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; CHECK-NOT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NOT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ;
   %cmp6 = icmp sgt i32 %n, 0
   br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
@@ -198,7 +198,7 @@ entry:
 vector.ph:                                        ; preds = %for.body.preheader
   %n.vec = and i32 %n, -4
   %broadcast.splatinsert8 = insertelement <4 x i32> poison, i32 %x, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> poison, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph

diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions-inseltpoison.ll
new file mode 100644
index 000000000000..fbaf7bfd85ed
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions-inseltpoison.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=armv7-apple-darwin < %s -codegenprepare -S | FileCheck -check-prefix=NEON %s
+; RUN: opt -mtriple=armv6-unknown-linux < %s -codegenprepare -S | FileCheck -check-prefix=NONEON %s
+
+define <8 x i16> @sink_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; NEON-LABEL: @sink_zext(
+; NEON-NEXT:  entry:
+; NEON-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NEON:       if.then:
+; NEON-NEXT:    [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16>
+; NEON-NEXT:    [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; NEON-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; NEON-NEXT:    ret <8 x i16> [[RES_1]]
+; NEON:       if.else:
+; NEON-NEXT:    [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+; NEON-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+; NEON-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; NEON-NEXT:    ret <8 x i16> [[RES_2]]
+;
+; NONEON-LABEL: @sink_zext(
+; NONEON-NEXT:  entry:
+; NONEON-NEXT:    [[ZA:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; NONEON-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NONEON:       if.then:
+; NONEON-NEXT:    [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16>
+; NONEON-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[ZA]], [[ZB_1]]
+; NONEON-NEXT:    ret <8 x i16> [[RES_1]]
+; NONEON:       if.else:
+; NONEON-NEXT:    [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+; NONEON-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[ZA]], [[ZB_2]]
+; NONEON-NEXT:    ret <8 x i16> [[RES_2]]
+;
+entry:
+  %za = zext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = zext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = zext <8 x i8> %b to <8 x i16>
+  %res.2 = sub <8 x i16> %za, %zb.2
+  ret <8 x i16> %res.2
+}
+
+define <8 x i16> @sink_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; NEON-LABEL: @sink_sext(
+; NEON-NEXT:  entry:
+; NEON-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NEON:       if.then:
+; NEON-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; NEON-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; NEON-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; NEON-NEXT:    ret <8 x i16> [[RES_1]]
+; NEON:       if.else:
+; NEON-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; NEON-NEXT:    [[TMP1:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+; NEON-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; NEON-NEXT:    ret <8 x i16> [[RES_2]]
+;
+; NONEON-LABEL: @sink_sext(
+; NONEON-NEXT:  entry:
+; NONEON-NEXT:    [[ZA:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; NONEON-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NONEON:       if.then:
+; NONEON-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; NONEON-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[ZA]], [[ZB_1]]
+; NONEON-NEXT:    ret <8 x i16> [[RES_1]]
+; NONEON:       if.else:
+; NONEON-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; NONEON-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[ZA]], [[ZB_2]]
+; NONEON-NEXT:    ret <8 x i16> [[RES_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  %res.2 = sub <8 x i16> %za, %zb.2
+  ret <8 x i16> %res.2
+}
+
+define <8 x i16> @do_not_sink_nonfree_zext(<8 x i8> %a, <8 x i16> %b, i1 %c) {
+;
+; NEON-LABEL: @do_not_sink_nonfree_zext(
+; NEON-NEXT:  entry:
+; NEON-NEXT:    [[ZA:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; NEON-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NEON:       if.then:
+; NEON-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[ZA]], [[B:%.*]]
+; NEON-NEXT:    ret <8 x i16> [[RES_1]]
+; NEON:       if.else:
+; NEON-NEXT:    ret <8 x i16> [[B]]
+;
+; NONEON-LABEL: @do_not_sink_nonfree_zext(
+; NONEON-NEXT:  entry:
+; NONEON-NEXT:    [[ZA:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; NONEON-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NONEON:       if.then:
+; NONEON-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[ZA]], [[B:%.*]]
+; NONEON-NEXT:    ret <8 x i16> [[RES_1]]
+; NONEON:       if.else:
+; NONEON-NEXT:    ret <8 x i16> [[B]]
+;
+entry:
+  %za = zext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %res.1 = add <8 x i16> %za, %b
+  ret <8 x i16> %res.1
+
+if.else:
+  ret <8 x i16> %b
+}
+
+define <8 x i16> @do_not_sink_nonfree_sext(<8 x i8> %a, <8 x i16> %b, i1 %c) {
+; CHECK-LABEL: @do_not_sink_nonfree_sext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[ZB_2]]
+;
+; NEON-LABEL: @do_not_sink_nonfree_sext(
+; NEON-NEXT:  entry:
+; NEON-NEXT:    [[ZA:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; NEON-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NEON:       if.then:
+; NEON-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[ZA]], [[B:%.*]]
+; NEON-NEXT:    ret <8 x i16> [[RES_1]]
+; NEON:       if.else:
+; NEON-NEXT:    ret <8 x i16> [[B]]
+;
+; NONEON-LABEL: @do_not_sink_nonfree_sext(
+; NONEON-NEXT:  entry:
+; NONEON-NEXT:    [[ZA:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; NONEON-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NONEON:       if.then:
+; NONEON-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[ZA]], [[B:%.*]]
+; NONEON-NEXT:    ret <8 x i16> [[RES_1]]
+; NONEON:       if.else:
+; NONEON-NEXT:    ret <8 x i16> [[B]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %res.1 = add <8 x i16> %za, %b
+  ret <8 x i16> %res.1
+
+if.else:
+  ret <8 x i16> %b
+}
+
+declare void @user1(<8 x i16>)
+
+; Exts can be sunk.
+define <8 x i16> @sink_shufflevector_ext_subadd_multiuse(<16 x i8> %a, <16 x i8> %b) {
+; NEON-LABEL: @sink_shufflevector_ext_subadd_multiuse(
+; NEON-NEXT:  entry:
+; NEON-NEXT:    [[S1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:    [[S3:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:    [[Z3:%.*]] = sext <8 x i8> [[S3]] to <8 x i16>
+; NEON-NEXT:    call void @user1(<8 x i16> [[Z3]])
+; NEON-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NEON:       if.then:
+; NEON-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:    [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16>
+; NEON-NEXT:    [[TMP0:%.*]] = zext <8 x i8> [[S1]] to <8 x i16>
+; NEON-NEXT:    [[RES1:%.*]] = add <8 x i16> [[TMP0]], [[Z2]]
+; NEON-NEXT:    ret <8 x i16> [[RES1]]
+; NEON:       if.else:
+; NEON-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:    [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16>
+; NEON-NEXT:    [[TMP1:%.*]] = sext <8 x i8> [[S3]] to <8 x i16>
+; NEON-NEXT:    [[RES2:%.*]] = sub <8 x i16> [[TMP1]], [[Z4]]
+; NEON-NEXT:    ret <8 x i16> [[RES2]]
+;
+; NONEON-LABEL: @sink_shufflevector_ext_subadd_multiuse(
+; NONEON-NEXT:  entry:
+; NONEON-NEXT:    [[S1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NONEON-NEXT:    [[Z1:%.*]] = zext <8 x i8> [[S1]] to <8 x i16>
+; NONEON-NEXT:    [[S3:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NONEON-NEXT:    [[Z3:%.*]] = sext <8 x i8> [[S3]] to <8 x i16>
+; NONEON-NEXT:    call void @user1(<8 x i16> [[Z3]])
+; NONEON-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; NONEON:       if.then:
+; NONEON-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NONEON-NEXT:    [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16>
+; NONEON-NEXT:    [[RES1:%.*]] = add <8 x i16> [[Z1]], [[Z2]]
+; NONEON-NEXT:    ret <8 x i16> [[RES1]]
+; NONEON:       if.else:
+; NONEON-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NONEON-NEXT:    [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16>
+; NONEON-NEXT:    [[RES2:%.*]] = sub <8 x i16> [[Z3]], [[Z4]]
+; NONEON-NEXT:    ret <8 x i16> [[RES2]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z1 = zext <8 x i8> %s1 to <8 x i16>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z3 = sext <8 x i8> %s3 to <8 x i16>
+  call void @user1(<8 x i16> %z3)
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z2 = zext <8 x i8> %s2 to <8 x i16>
+  %res1 = add <8 x i16> %z1, %z2
+  ret <8 x i16> %res1
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z4 = sext <8 x i8> %s4 to <8 x i16>
+  %res2 = sub <8 x i16> %z3, %z4
+  ret <8 x i16> %res2
+}

diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll
index 7cffedea8a35..9930b032550d 100644
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll
@@ -15,7 +15,7 @@ define signext i8 @dead(i16* noalias nocapture readonly %s1, i16 zeroext %x, i8*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[L7]], align 2
 ; CHECK-NEXT:    [[L8:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[L9:%.*]] = mul <8 x i8> [[TMP2]], [[L8]]
 ; CHECK-NEXT:    [[L13:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[L14:%.*]] = bitcast i8* [[L13]] to <8 x i8>*
@@ -30,7 +30,7 @@ entry:
   %n.vec = and i32 %n, -8
   %l0 = trunc i16 %x to i8
   %l1 = insertelement <8 x i8> poison, i8 %l0, i32 0
-  %broadcast.splat26 = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %broadcast.splat26 = shufflevector <8 x i8> %l1, <8 x i8> poison, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %entry
@@ -58,7 +58,7 @@ define signext i8 @alive(i16* noalias nocapture readonly %s1, i16 zeroext %x, i8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N:%.*]], -8
 ; CHECK-NEXT:    [[L0:%.*]] = trunc i16 [[X:%.*]] to i8
 ; CHECK-NEXT:    [[L1:%.*]] = insertelement <8 x i8> poison, i8 [[L0]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[L1]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[L1]], <8 x i8> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[L2:%.*]] = sub <8 x i8> zeroinitializer, [[BROADCAST_SPLAT26]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -69,7 +69,7 @@ define signext i8 @alive(i16* noalias nocapture readonly %s1, i16 zeroext %x, i8
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[L7]], align 2
 ; CHECK-NEXT:    [[L8:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[L9:%.*]] = mul <8 x i8> [[TMP2]], [[L8]]
 ; CHECK-NEXT:    [[L13:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[L14:%.*]] = bitcast i8* [[L13]] to <8 x i8>*
@@ -84,7 +84,7 @@ entry:
   %n.vec = and i32 %n, -8
   %l0 = trunc i16 %x to i8
   %l1 = insertelement <8 x i8> poison, i8 %l0, i32 0
-  %broadcast.splat26 = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %broadcast.splat26 = shufflevector <8 x i8> %l1, <8 x i8> poison, <8 x i32> zeroinitializer
   %l2 = sub <8 x i8> zeroinitializer, %broadcast.splat26
   br label %vector.body
 

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash-inseltpoison.ll
new file mode 100644
index 000000000000..9eede8cf361b
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/cgp_shuffle_crash-inseltpoison.ll
@@ -0,0 +1,14 @@
+; RUN: opt -codegenprepare -S %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: shuffle_one_source
+
+define <2 x i8> @shuffle_one_source(i32 %x) {
+  %Shuf = shufflevector <2 x i8> zeroinitializer, <2 x i8> zeroinitializer, <2 x i32> poison
+  %Cmp = icmp slt i32 480483, %x
+  %B = mul <2 x i8> %Shuf, %Shuf
+  %S = select i1 %Cmp, <2 x i8> %B, <2 x i8> zeroinitializer
+  ret <2 x i8> %Shuf
+}
+

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
index 88967ac1ef7c..a1fd88e4b314 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
@@ -16,7 +16,7 @@ define <4 x i32> @splat_base(i32* %base, <4 x i64> %index) {
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
   %broadcast.splatinsert = insertelement <4 x i32*> poison, i32* %base, i32 0
-  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> poison, <4 x i32> zeroinitializer
   %gep = getelementptr i32, <4 x i32*> %broadcast.splat, <4 x i64> %index
   %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
   ret <4 x i32> %res
@@ -42,7 +42,7 @@ define <4 x i32> @scalar_index(i32* %base, i64 %index) {
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
   %broadcast.splatinsert = insertelement <4 x i32*> poison, i32* %base, i32 0
-  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> poison, <4 x i32> zeroinitializer
   %gep = getelementptr i32, <4 x i32*> %broadcast.splat, i64 %index
   %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
   ret <4 x i32> %res
@@ -56,7 +56,7 @@ define <4 x i32> @splat_index(i32* %base, i64 %index) {
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
   %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
+  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer
   %gep = getelementptr i32, i32* %base, <4 x i64> %broadcast.splat
   %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
   ret <4 x i32> %res
@@ -79,7 +79,7 @@ define <4 x i32> @global_struct_splat() {
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = insertelement <4 x %struct.a*> poison, %struct.a* @c, i32 0
-  %2 = shufflevector <4 x %struct.a*> %1, <4 x %struct.a*> undef, <4 x i32> zeroinitializer
+  %2 = shufflevector <4 x %struct.a*> %1, <4 x %struct.a*> poison, <4 x i32> zeroinitializer
   %3 = getelementptr %struct.a, <4 x %struct.a*> %2, <4 x i64> zeroinitializer, i32 1
   %4 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
   ret <4 x i32> %4
@@ -92,7 +92,7 @@ define <4 x i32> @splat_ptr_gather(i32* %ptr, <4 x i1> %mask, <4 x i32> %passthr
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %1 = insertelement <4 x i32*> poison, i32* %ptr, i32 0
-  %2 = shufflevector <4 x i32*> %1, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %2 = shufflevector <4 x i32*> %1, <4 x i32*> poison, <4 x i32> zeroinitializer
   %3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
   ret <4 x i32> %3
 }
@@ -104,7 +104,7 @@ define void @splat_ptr_scatter(i32* %ptr, <4 x i1> %mask, <4 x i32> %val) {
 ; CHECK-NEXT:    ret void
 ;
   %1 = insertelement <4 x i32*> poison, i32* %ptr, i32 0
-  %2 = shufflevector <4 x i32*> %1, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %2 = shufflevector <4 x i32*> %1, <4 x i32*> poison, <4 x i32> zeroinitializer
   call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %2, i32 4, <4 x i1> %mask)
   ret void
 }

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll
index 1d26beeb236e..1c9617ab0d44 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll
@@ -8,8 +8,8 @@
 
 define <4 x i32> @vector_variable_shift_right_v4i32(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; AVX1-LABEL: @vector_variable_shift_right_v4i32(
-; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX1-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
 ; AVX1-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SPLAT1]]
 ; AVX1-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[Z]], [[SPLAT2]]
@@ -17,28 +17,29 @@ define <4 x i32> @vector_variable_shift_right_v4i32(<4 x i1> %cond, <4 x i32> %x
 ; AVX1-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; AVX2-LABEL: @vector_variable_shift_right_v4i32(
-; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX2-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
 ; AVX2-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
 ; AVX2-NEXT:    ret <4 x i32> [[SH]]
 ;
 ; AVX512BW-LABEL: @vector_variable_shift_right_v4i32(
-; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX512BW-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
 ; AVX512BW-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
 ; AVX512BW-NEXT:    ret <4 x i32> [[SH]]
 ;
 ; XOP-LABEL: @vector_variable_shift_right_v4i32(
-; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
-; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; XOP-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
 ; XOP-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
 ; XOP-NEXT:    ret <4 x i32> [[SH]]
 ;
-  %splat1 = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
-  %splat2 = shufflevector <4 x i32> %y, <4 x i32> undef, <4 x i32> zeroinitializer
+
+  %splat1 = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
+  %splat2 = shufflevector <4 x i32> %y, <4 x i32> poison, <4 x i32> zeroinitializer
   %sel = select <4 x i1> %cond, <4 x i32> %splat1, <4 x i32> %splat2
   %sh = lshr <4 x i32> %z, %sel
   ret <4 x i32> %sh
@@ -46,8 +47,8 @@ define <4 x i32> @vector_variable_shift_right_v4i32(<4 x i1> %cond, <4 x i32> %x
 
 define <16 x i16> @vector_variable_shift_right_v16i16(<16 x i1> %cond, <16 x i16> %x, <16 x i16> %y, <16 x i16> %z) {
 ; AVX1-LABEL: @vector_variable_shift_right_v16i16(
-; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; AVX1-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
 ; AVX1-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SPLAT1]]
 ; AVX1-NEXT:    [[TMP2:%.*]] = lshr <16 x i16> [[Z]], [[SPLAT2]]
@@ -55,8 +56,8 @@ define <16 x i16> @vector_variable_shift_right_v16i16(<16 x i1> %cond, <16 x i16
 ; AVX1-NEXT:    ret <16 x i16> [[TMP3]]
 ;
 ; AVX2-LABEL: @vector_variable_shift_right_v16i16(
-; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; AVX2-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
 ; AVX2-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SPLAT1]]
 ; AVX2-NEXT:    [[TMP2:%.*]] = lshr <16 x i16> [[Z]], [[SPLAT2]]
@@ -64,21 +65,22 @@ define <16 x i16> @vector_variable_shift_right_v16i16(<16 x i1> %cond, <16 x i16
 ; AVX2-NEXT:    ret <16 x i16> [[TMP3]]
 ;
 ; AVX512BW-LABEL: @vector_variable_shift_right_v16i16(
-; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; AVX512BW-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
 ; AVX512BW-NEXT:    [[SH:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SEL]]
 ; AVX512BW-NEXT:    ret <16 x i16> [[SH]]
 ;
 ; XOP-LABEL: @vector_variable_shift_right_v16i16(
-; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
-; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; XOP-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
 ; XOP-NEXT:    [[SH:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SEL]]
 ; XOP-NEXT:    ret <16 x i16> [[SH]]
 ;
-  %splat1 = shufflevector <16 x i16> %x, <16 x i16> undef, <16 x i32> zeroinitializer
-  %splat2 = shufflevector <16 x i16> %y, <16 x i16> undef, <16 x i32> zeroinitializer
+
+  %splat1 = shufflevector <16 x i16> %x, <16 x i16> poison, <16 x i32> zeroinitializer
+  %splat2 = shufflevector <16 x i16> %y, <16 x i16> poison, <16 x i32> zeroinitializer
   %sel = select <16 x i1> %cond, <16 x i16> %splat1, <16 x i16> %splat2
   %sh = lshr <16 x i16> %z, %sel
   ret <16 x i16> %sh
@@ -86,14 +88,15 @@ define <16 x i16> @vector_variable_shift_right_v16i16(<16 x i1> %cond, <16 x i16
 
 define <32 x i8> @vector_variable_shift_right_v32i8(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y, <32 x i8> %z) {
 ; ALL-LABEL: @vector_variable_shift_right_v32i8(
-; ALL-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
-; ALL-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
+; ALL-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer
+; ALL-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer
 ; ALL-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
 ; ALL-NEXT:    [[SH:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SEL]]
 ; ALL-NEXT:    ret <32 x i8> [[SH]]
 ;
-  %splat1 = shufflevector <32 x i8> %x, <32 x i8> undef, <32 x i32> zeroinitializer
-  %splat2 = shufflevector <32 x i8> %y, <32 x i8> undef, <32 x i32> zeroinitializer
+
+  %splat1 = shufflevector <32 x i8> %x, <32 x i8> poison, <32 x i32> zeroinitializer
+  %splat2 = shufflevector <32 x i8> %y, <32 x i8> poison, <32 x i32> zeroinitializer
   %sel = select <32 x i1> %cond, <32 x i8> %splat1, <32 x i8> %splat2
   %sh = lshr <32 x i8> %z, %sel
   ret <32 x i8> %sh
@@ -110,11 +113,11 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; AVX1:       vector.ph:
 ; AVX1-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
 ; AVX1-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[AMT0:%.*]], i32 0
-; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX1-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> poison, i32 [[AMT1:%.*]], i32 0
-; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX1-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; AVX1-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX1:       vector.body:
 ; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -123,9 +126,9 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; AVX1-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
 ; AVX1-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
-; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP4]]
-; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX1-NEXT:    [[TMP7:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP6]]
 ; AVX1-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP5]], <4 x i32> [[TMP7]]
 ; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
@@ -145,11 +148,11 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; AVX2:       vector.ph:
 ; AVX2-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
 ; AVX2-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[AMT0:%.*]], i32 0
-; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX2-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> poison, i32 [[AMT1:%.*]], i32 0
-; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX2-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; AVX2-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX2:       vector.body:
 ; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -176,11 +179,11 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; AVX512BW:       vector.ph:
 ; AVX512BW-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
 ; AVX512BW-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[AMT0:%.*]], i32 0
-; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX512BW-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> poison, i32 [[AMT1:%.*]], i32 0
-; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX512BW-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; AVX512BW-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; AVX512BW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512BW:       vector.body:
 ; AVX512BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -207,11 +210,11 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; XOP:       vector.ph:
 ; XOP-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
 ; XOP-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[AMT0:%.*]], i32 0
-; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; XOP-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> poison, i32 [[AMT1:%.*]], i32 0
-; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; XOP-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; XOP-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; XOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; XOP:       vector.body:
 ; XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -230,6 +233,7 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; XOP:       exit:
 ; XOP-NEXT:    ret void
 ;
+
 entry:
   %cmp16 = icmp sgt i32 %count, 0
   %wide.trip.count = zext i32 %count to i64
@@ -238,11 +242,11 @@ entry:
 vector.ph:
   %n.vec = and i64 %wide.trip.count, 4294967292
   %splatinsert18 = insertelement <4 x i32> poison, i32 %amt0, i32 0
-  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> poison, <4 x i32> zeroinitializer
   %splatinsert20 = insertelement <4 x i32> poison, i32 %amt1, i32 0
-  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> poison, <4 x i32> zeroinitializer
   %splatinsert22 = insertelement <4 x i32> poison, i32 %x, i32 0
-  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> poison, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:
@@ -272,9 +276,9 @@ define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
 ; AVX1-LABEL: @fancierRotate2(
 ; AVX1-NEXT:  entry:
 ; AVX1-NEXT:    [[I0:%.*]] = insertelement <8 x i32> poison, i32 [[ROT0:%.*]], i32 0
-; AVX1-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX1-NEXT:    [[I1:%.*]] = insertelement <8 x i32> poison, i32 [[ROT1:%.*]], i32 0
-; AVX1-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX1-NEXT:    br label [[LOOP:%.*]]
 ; AVX1:       loop:
 ; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
@@ -286,9 +290,9 @@ define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
 ; AVX1-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
 ; AVX1-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
 ; AVX1-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
-; AVX1-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX1-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP0]])
-; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP2]])
 ; AVX1-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[TMP1]], <8 x i32> [[TMP3]]
 ; AVX1-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* [[T5]], align 4
@@ -301,9 +305,9 @@ define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
 ; AVX2-LABEL: @fancierRotate2(
 ; AVX2-NEXT:  entry:
 ; AVX2-NEXT:    [[I0:%.*]] = insertelement <8 x i32> poison, i32 [[ROT0:%.*]], i32 0
-; AVX2-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX2-NEXT:    [[I1:%.*]] = insertelement <8 x i32> poison, i32 [[ROT1:%.*]], i32 0
-; AVX2-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX2-NEXT:    br label [[LOOP:%.*]]
 ; AVX2:       loop:
 ; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
@@ -326,9 +330,9 @@ define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
 ; AVX512BW-LABEL: @fancierRotate2(
 ; AVX512BW-NEXT:  entry:
 ; AVX512BW-NEXT:    [[I0:%.*]] = insertelement <8 x i32> poison, i32 [[ROT0:%.*]], i32 0
-; AVX512BW-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX512BW-NEXT:    [[I1:%.*]] = insertelement <8 x i32> poison, i32 [[ROT1:%.*]], i32 0
-; AVX512BW-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX512BW-NEXT:    br label [[LOOP:%.*]]
 ; AVX512BW:       loop:
 ; AVX512BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
@@ -351,9 +355,9 @@ define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
 ; XOP-LABEL: @fancierRotate2(
 ; XOP-NEXT:  entry:
 ; XOP-NEXT:    [[I0:%.*]] = insertelement <8 x i32> poison, i32 [[ROT0:%.*]], i32 0
-; XOP-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; XOP-NEXT:    [[I1:%.*]] = insertelement <8 x i32> poison, i32 [[ROT1:%.*]], i32 0
-; XOP-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; XOP-NEXT:    br label [[LOOP:%.*]]
 ; XOP:       loop:
 ; XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
@@ -373,11 +377,12 @@ define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
 ; XOP:       exit:
 ; XOP-NEXT:    ret void
 ;
+
 entry:
   %i0 = insertelement <8 x i32> poison, i32 %rot0, i32 0
-  %s0 = shufflevector <8 x i32> %i0, <8 x i32> undef, <8 x i32> zeroinitializer
+  %s0 = shufflevector <8 x i32> %i0, <8 x i32> poison, <8 x i32> zeroinitializer
   %i1 = insertelement <8 x i32> poison, i32 %rot1, i32 0
-  %s1 = shufflevector <8 x i32> %i1, <8 x i32> undef, <8 x i32> zeroinitializer
+  %s1 = shufflevector <8 x i32> %i1, <8 x i32> poison, <8 x i32> zeroinitializer
   br label %loop
 
 loop:

diff  --git a/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll
index 4e9f09fa32bc..be3723b7d554 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10.9.0"
 
 define <16 x i8> @test_8bit(<16 x i8> %lhs, <16 x i8> %tmp, i1 %tst) {
 ; CHECK-LABEL: @test_8bit(
-; CHECK-NEXT:    [[MASK:%.*]] = shufflevector <16 x i8> [[TMP:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[MASK:%.*]] = shufflevector <16 x i8> [[TMP:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if_true:
 ; CHECK-NEXT:    ret <16 x i8> [[MASK]]
@@ -17,7 +17,7 @@ define <16 x i8> @test_8bit(<16 x i8> %lhs, <16 x i8> %tmp, i1 %tst) {
 ; CHECK-NEXT:    [[RES:%.*]] = shl <16 x i8> [[LHS:%.*]], [[MASK]]
 ; CHECK-NEXT:    ret <16 x i8> [[RES]]
 ;
-  %mask = shufflevector <16 x i8> %tmp, <16 x i8> undef, <16 x i32> zeroinitializer
+  %mask = shufflevector <16 x i8> %tmp, <16 x i8> poison, <16 x i32> zeroinitializer
   br i1 %tst, label %if_true, label %if_false
 
 if_true:
@@ -30,17 +30,17 @@ if_false:
 
 define <8 x i16> @test_16bit(<8 x i16> %lhs, <8 x i16> %tmp, i1 %tst) {
 ; CHECK-SSE2-LABEL: @test_16bit(
-; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-SSE2:       if_true:
 ; CHECK-SSE2-NEXT:    ret <8 x i16> [[MASK]]
 ; CHECK-SSE2:       if_false:
-; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-SSE2-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[TMP1]]
 ; CHECK-SSE2-NEXT:    ret <8 x i16> [[RES]]
 ;
 ; CHECK-XOP-LABEL: @test_16bit(
-; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-XOP:       if_true:
 ; CHECK-XOP-NEXT:    ret <8 x i16> [[MASK]]
@@ -49,17 +49,17 @@ define <8 x i16> @test_16bit(<8 x i16> %lhs, <8 x i16> %tmp, i1 %tst) {
 ; CHECK-XOP-NEXT:    ret <8 x i16> [[RES]]
 ;
 ; CHECK-AVX2-LABEL: @test_16bit(
-; CHECK-AVX2-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-AVX2-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-AVX2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-AVX2:       if_true:
 ; CHECK-AVX2-NEXT:    ret <8 x i16> [[MASK]]
 ; CHECK-AVX2:       if_false:
-; CHECK-AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-AVX2-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[TMP1]]
 ; CHECK-AVX2-NEXT:    ret <8 x i16> [[RES]]
 ;
 ; CHECK-AVX512BW-LABEL: @test_16bit(
-; CHECK-AVX512BW-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-AVX512BW-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-AVX512BW-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-AVX512BW:       if_true:
 ; CHECK-AVX512BW-NEXT:    ret <8 x i16> [[MASK]]
@@ -67,7 +67,7 @@ define <8 x i16> @test_16bit(<8 x i16> %lhs, <8 x i16> %tmp, i1 %tst) {
 ; CHECK-AVX512BW-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[MASK]]
 ; CHECK-AVX512BW-NEXT:    ret <8 x i16> [[RES]]
 ;
-  %mask = shufflevector <8 x i16> %tmp, <8 x i16> undef, <8 x i32> zeroinitializer
+  %mask = shufflevector <8 x i16> %tmp, <8 x i16> poison, <8 x i32> zeroinitializer
   br i1 %tst, label %if_true, label %if_false
 
 if_true:
@@ -80,7 +80,7 @@ if_false:
 
 define <4 x i32> @test_notsplat(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
 ; CHECK-LABEL: @test_notsplat(
-; CHECK-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
 ; CHECK-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if_true:
 ; CHECK-NEXT:    ret <4 x i32> [[MASK]]
@@ -88,7 +88,7 @@ define <4 x i32> @test_notsplat(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
 ; CHECK-NEXT:    [[RES:%.*]] = shl <4 x i32> [[LHS:%.*]], [[MASK]]
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
-  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
   br i1 %tst, label %if_true, label %if_false
 
 if_true:
@@ -101,17 +101,17 @@ if_false:
 
 define <4 x i32> @test_32bit(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
 ; CHECK-SSE2-LABEL: @test_32bit(
-; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
 ; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-SSE2:       if_true:
 ; CHECK-SSE2-NEXT:    ret <4 x i32> [[MASK]]
 ; CHECK-SSE2:       if_false:
-; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
 ; CHECK-SSE2-NEXT:    [[RES:%.*]] = ashr <4 x i32> [[LHS:%.*]], [[TMP1]]
 ; CHECK-SSE2-NEXT:    ret <4 x i32> [[RES]]
 ;
 ; CHECK-XOP-LABEL: @test_32bit(
-; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
 ; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-XOP:       if_true:
 ; CHECK-XOP-NEXT:    ret <4 x i32> [[MASK]]
@@ -120,7 +120,7 @@ define <4 x i32> @test_32bit(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
 ; CHECK-XOP-NEXT:    ret <4 x i32> [[RES]]
 ;
 ; CHECK-AVX-LABEL: @test_32bit(
-; CHECK-AVX-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+; CHECK-AVX-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
 ; CHECK-AVX-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-AVX:       if_true:
 ; CHECK-AVX-NEXT:    ret <4 x i32> [[MASK]]
@@ -128,7 +128,7 @@ define <4 x i32> @test_32bit(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
 ; CHECK-AVX-NEXT:    [[RES:%.*]] = ashr <4 x i32> [[LHS:%.*]], [[MASK]]
 ; CHECK-AVX-NEXT:    ret <4 x i32> [[RES]]
 ;
-  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
   br i1 %tst, label %if_true, label %if_false
 
 if_true:
@@ -141,17 +141,17 @@ if_false:
 
 define <2 x i64> @test_64bit(<2 x i64> %lhs, <2 x i64> %tmp, i1 %tst) {
 ; CHECK-SSE2-LABEL: @test_64bit(
-; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-SSE2:       if_true:
 ; CHECK-SSE2-NEXT:    ret <2 x i64> [[MASK]]
 ; CHECK-SSE2:       if_false:
-; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-SSE2-NEXT:    [[RES:%.*]] = lshr <2 x i64> [[LHS:%.*]], [[TMP1]]
 ; CHECK-SSE2-NEXT:    ret <2 x i64> [[RES]]
 ;
 ; CHECK-XOP-LABEL: @test_64bit(
-; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-XOP:       if_true:
 ; CHECK-XOP-NEXT:    ret <2 x i64> [[MASK]]
@@ -160,7 +160,7 @@ define <2 x i64> @test_64bit(<2 x i64> %lhs, <2 x i64> %tmp, i1 %tst) {
 ; CHECK-XOP-NEXT:    ret <2 x i64> [[RES]]
 ;
 ; CHECK-AVX-LABEL: @test_64bit(
-; CHECK-AVX-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-AVX-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-AVX-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK-AVX:       if_true:
 ; CHECK-AVX-NEXT:    ret <2 x i64> [[MASK]]
@@ -168,7 +168,7 @@ define <2 x i64> @test_64bit(<2 x i64> %lhs, <2 x i64> %tmp, i1 %tst) {
 ; CHECK-AVX-NEXT:    [[RES:%.*]] = lshr <2 x i64> [[LHS:%.*]], [[MASK]]
 ; CHECK-AVX-NEXT:    ret <2 x i64> [[RES]]
 ;
-  %mask = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
+  %mask = shufflevector <2 x i64> %tmp, <2 x i64> poison, <2 x i32> zeroinitializer
   br i1 %tst, label %if_true, label %if_false
 
 if_true:
@@ -189,7 +189,7 @@ define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {
 ; CHECK-SSE2-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
 ; CHECK-SSE2-NEXT:    [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>*
 ; CHECK-SSE2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4
-; CHECK-SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-SSE2-NEXT:    [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[TMP0]])
 ; CHECK-SSE2-NEXT:    store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
 ; CHECK-SSE2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
@@ -201,7 +201,7 @@ define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {
 ; CHECK-XOP-LABEL: @funnel_splatvar(
 ; CHECK-XOP-NEXT:  entry:
 ; CHECK-XOP-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> poison, i32 [[ROT:%.*]], i32 0
-; CHECK-XOP-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-XOP-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-XOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-XOP:       vector.body:
 ; CHECK-XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -219,7 +219,7 @@ define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {
 ; CHECK-AVX-LABEL: @funnel_splatvar(
 ; CHECK-AVX-NEXT:  entry:
 ; CHECK-AVX-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> poison, i32 [[ROT:%.*]], i32 0
-; CHECK-AVX-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-AVX-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-AVX:       vector.body:
 ; CHECK-AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -236,7 +236,7 @@ define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {
 ;
 entry:
   %broadcast.splatinsert15 = insertelement <8 x i32> poison, i32 %rot, i32 0
-  %broadcast.splat16 = shufflevector <8 x i32> %broadcast.splatinsert15, <8 x i32> undef, <8 x i32> zeroinitializer
+  %broadcast.splat16 = shufflevector <8 x i32> %broadcast.splatinsert15, <8 x i32> poison, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:

diff  --git a/llvm/test/Transforms/DeadStoreElimination/masked-dead-store-inseltpoison.ll b/llvm/test/Transforms/DeadStoreElimination/masked-dead-store-inseltpoison.ll
new file mode 100644
index 000000000000..4f3592523d05
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/masked-dead-store-inseltpoison.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -tbaa -dse -enable-dse-memoryssa=false -S < %s | FileCheck %s
+; RUN: opt -tbaa -dse -enable-dse-memoryssa=true -S < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+
+define dllexport i32 @f0(i8** %a0, i8** %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) #0 {
+; CHECK-LABEL: @f0(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = getelementptr inbounds i8*, i8** [[A0:%.*]], i32 [[A2:%.*]]
+; CHECK-NEXT:    [[V1:%.*]] = load i8*, i8** [[V0]], align 4, [[TBAA0:!tbaa !.*]]
+; CHECK-NEXT:    [[V2:%.*]] = getelementptr i8, i8* [[V1]], i32 [[A3:%.*]]
+; CHECK-NEXT:    [[V3:%.*]] = bitcast i8* [[V2]] to <128 x i8>*
+; CHECK-NEXT:    [[V6:%.*]] = getelementptr inbounds i8*, i8** [[A1:%.*]], i32 [[A4:%.*]]
+; CHECK-NEXT:    [[V7:%.*]] = load i8*, i8** [[V6]], align 4, [[TBAA3:!tbaa !.*]]
+; CHECK-NEXT:    [[V8:%.*]] = getelementptr i8, i8* [[V7]], i32 [[A5:%.*]]
+; CHECK-NEXT:    [[V9:%.*]] = bitcast i8* [[V8]] to <128 x i8>*
+; CHECK-NEXT:    [[V10:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* [[V9]], i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), [[TBAA5:!tbaa !.*]]
+; CHECK-NEXT:    [[V11:%.*]] = shufflevector <128 x i8> [[V10]], <128 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[V14:%.*]] = shufflevector <32 x i8> [[V11]], <32 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[V16:%.*]] = shufflevector <128 x i8> [[V14]], <128 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[V17:%.*]] = getelementptr inbounds i8*, i8** [[A1]], i32 [[A6:%.*]]
+; CHECK-NEXT:    [[V18:%.*]] = load i8*, i8** [[V17]], align 4, [[TBAA3]]
+; CHECK-NEXT:    [[V19:%.*]] = getelementptr i8, i8* [[V18]], i32 [[A7:%.*]]
+; CHECK-NEXT:    [[V20:%.*]] = bitcast i8* [[V19]] to <128 x i8>*
+; CHECK-NEXT:    [[V21:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* [[V20]], i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), [[TBAA5]]
+; CHECK-NEXT:    [[V22:%.*]] = shufflevector <128 x i8> [[V21]], <128 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[V23:%.*]] = icmp ugt <32 x i8> [[V16]], [[V22]]
+; CHECK-NEXT:    [[V24:%.*]] = select <32 x i1> [[V23]], <32 x i8> [[V16]], <32 x i8> [[V22]]
+; CHECK-NEXT:    [[V25:%.*]] = shufflevector <32 x i8> [[V24]], <32 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    tail call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> [[V25]], <128 x i8>* [[V3]], i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>), [[TBAA8:!tbaa !.*]]
+; CHECK-NEXT:    ret i32 0
+;
+b0:
+  %v0 = getelementptr inbounds i8*, i8** %a0, i32 %a2
+  %v1 = load i8*, i8** %v0, align 4, !tbaa !0
+  %v2 = getelementptr i8, i8* %v1, i32 %a3
+  %v3 = bitcast i8* %v2 to <128 x i8>*
+  tail call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <128 x i8>* %v3, i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>), !tbaa !3
+  %v6 = getelementptr inbounds i8*, i8** %a1, i32 %a4
+  %v7 = load i8*, i8** %v6, align 4, !tbaa !6
+  %v8 = getelementptr i8, i8* %v7, i32 %a5
+  %v9 = bitcast i8* %v8 to <128 x i8>*
+  %v10 = tail call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %v9, i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), !tbaa !8
+  %v11 = shufflevector <128 x i8> %v10, <128 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v14 = shufflevector <32 x i8> %v11, <32 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  tail call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %v14, <128 x i8>* %v3, i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>), !tbaa !3
+  %v16 = shufflevector <128 x i8> %v14, <128 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v17 = getelementptr inbounds i8*, i8** %a1, i32 %a6
+  %v18 = load i8*, i8** %v17, align 4, !tbaa !6
+  %v19 = getelementptr i8, i8* %v18, i32 %a7
+  %v20 = bitcast i8* %v19 to <128 x i8>*
+  %v21 = tail call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %v20, i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), !tbaa !8
+  %v22 = shufflevector <128 x i8> %v21, <128 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v23 = icmp ugt <32 x i8> %v16, %v22
+  %v24 = select <32 x i1> %v23, <32 x i8> %v16, <32 x i8> %v22
+  %v25 = shufflevector <32 x i8> %v24, <32 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  tail call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %v25, <128 x i8>* %v3, i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>), !tbaa !3
+  ret i32 0
+}
+
+declare void @llvm.masked.store.v128i8.p0v128i8(<128 x i8>, <128 x i8>*, i32 immarg, <128 x i1>) #1
+declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32 immarg, <128 x i1>, <128 x i8>) #2
+
+attributes #0 = { nounwind willreturn }
+attributes #1 = { argmemonly nounwind willreturn }
+attributes #2 = { argmemonly nounwind readonly willreturn }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"0x2cf74d0", !2, i64 0}
+!2 = !{!"tvm-tbaa"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"i8", !5, i64 0}
+!5 = !{!"0x2c6ebb0", !2, i64 0}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"0x2cff870", !2, i64 0}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"i8", !10, i64 0}
+!10 = !{!"0x2c6c3c0", !2, i64 0}

diff  --git a/llvm/test/Transforms/Inline/inlined-loop-metadata-inseltpoison.ll b/llvm/test/Transforms/Inline/inlined-loop-metadata-inseltpoison.ll
new file mode 100755
index 000000000000..60c2f42a8f35
--- /dev/null
+++ b/llvm/test/Transforms/Inline/inlined-loop-metadata-inseltpoison.ll
@@ -0,0 +1,159 @@
+; This test checks that the !llvm.loop metadata has been updated after inlining
+; so that the start and end locations refer to the inlined DILocations.
+
+; RUN: opt -inline -always-inline %s -S 2>&1 | FileCheck %s
+; CHECK: br i1 %{{.*}}, label %middle.block.i, label %vector.body.i, !dbg !{{[0-9]+}}, !llvm.loop [[VECTOR:![0-9]+]]
+; CHECK: br i1 %{{.*}}, label %for.cond.cleanup.loopexit.i, label %for.body.i, !dbg !{{[0-9]+}}, !llvm.loop [[SCALAR:![0-9]+]]
+; CHECK-DAG: [[VECTOR]] = distinct !{[[VECTOR]], [[START:![0-9]+]], [[END:![0-9]+]], [[IS_VECTORIZED:![0-9]+]]}
+; CHECK-DAG: [[SCALAR]] = distinct !{[[SCALAR]], [[START]], [[END]], [[NO_UNROLL:![0-9]+]], [[IS_VECTORIZED]]}
+; CHECK-DAG: [[IS_VECTORIZED]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-DAG: [[NO_UNROLL]] = !{!"llvm.loop.unroll.runtime.disable"}
+
+; This IR can be generated by running:
+;   clang -emit-llvm -S -gmlt -O2 inlined.cpp -o - -mllvm -opt-bisect-limit=53 |\
+;   opt -loop-vectorize
+;
+; Where inlined.cpp contains:
+; extern int *Array;
+; static int bar(unsigned x)
+; {
+;   int Ret = 0;
+;   for (unsigned i = 0; i < x; ++i)
+;   {
+;     Ret += Array[i] * i;
+;   }
+;   return Ret;
+; }
+;
+; int foo(unsigned x)
+; {
+;   int Bar = bar(x);
+;   return Bar;
+; }
+
+@"?Array@@3PEAHEA" = external dso_local local_unnamed_addr global i32*, align 8
+
+define dso_local i32 @"?foo@@YAHI at Z"(i32 %x) local_unnamed_addr !dbg !8 {
+entry:
+  %call = call fastcc i32 @"?bar@@YAHI at Z"(i32 %x), !dbg !10
+  ret i32 %call, !dbg !11
+}
+
+define internal fastcc i32 @"?bar@@YAHI at Z"(i32 %x) unnamed_addr !dbg !12 {
+entry:
+  %cmp7 = icmp eq i32 %x, 0, !dbg !13
+  br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph, !dbg !13
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = load i32*, i32** @"?Array@@3PEAHEA", align 8, !dbg !14, !tbaa !15
+  %wide.trip.count = zext i32 %x to i64, !dbg !14
+  %min.iters.check = icmp ult i64 %wide.trip.count, 8, !dbg !13
+  br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !13
+
+vector.ph:                                        ; preds = %for.body.lr.ph
+  %n.mod.vf = urem i64 %wide.trip.count, 8, !dbg !13
+  %n.vec = sub i64 %wide.trip.count, %n.mod.vf, !dbg !13
+  br label %vector.body, !dbg !13
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ], !dbg !13
+  %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %17, %vector.body ]
+  %vec.phi2 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %18, %vector.body ]
+  %vec.ind4 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next7, %vector.body ], !dbg !19
+  %step.add = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+  %1 = add i64 %index, 0, !dbg !13
+  %2 = add i64 %index, 1, !dbg !13
+  %3 = add i64 %index, 2, !dbg !13
+  %4 = add i64 %index, 3, !dbg !13
+  %5 = add i64 %index, 4, !dbg !13
+  %6 = add i64 %index, 5, !dbg !13
+  %7 = add i64 %index, 6, !dbg !13
+  %8 = add i64 %index, 7, !dbg !13
+  %9 = getelementptr inbounds i32, i32* %0, i64 %1, !dbg !19
+  %10 = getelementptr inbounds i32, i32* %0, i64 %5, !dbg !19
+  %11 = getelementptr inbounds i32, i32* %9, i32 0, !dbg !19
+  %12 = bitcast i32* %11 to <4 x i32>*, !dbg !19
+  %wide.load = load <4 x i32>, <4 x i32>* %12, align 4, !dbg !19, !tbaa !20
+  %13 = getelementptr inbounds i32, i32* %9, i32 4, !dbg !19
+  %14 = bitcast i32* %13 to <4 x i32>*, !dbg !19
+  %wide.load3 = load <4 x i32>, <4 x i32>* %14, align 4, !dbg !19, !tbaa !20
+  %step.add5 = add <4 x i32> %vec.ind4, <i32 4, i32 4, i32 4, i32 4>, !dbg !19
+  %15 = mul <4 x i32> %wide.load, %vec.ind4, !dbg !19
+  %16 = mul <4 x i32> %wide.load3, %step.add5, !dbg !19
+  %17 = add <4 x i32> %15, %vec.phi, !dbg !19
+  %18 = add <4 x i32> %16, %vec.phi2, !dbg !19
+  %index.next = add i64 %index, 8, !dbg !13
+  %vec.ind.next = add <4 x i64> %step.add, <i64 4, i64 4, i64 4, i64 4>
+  %vec.ind.next7 = add <4 x i32> %step.add5, <i32 4, i32 4, i32 4, i32 4>, !dbg !19
+  %19 = icmp eq i64 %index.next, %n.vec, !dbg !13
+  br i1 %19, label %middle.block, label %vector.body, !dbg !13, !llvm.loop !22
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <4 x i32> %18, %17, !dbg !19
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>, !dbg !19
+  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf, !dbg !19
+  %rdx.shuf9 = shufflevector <4 x i32> %bin.rdx8, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>, !dbg !19
+  %bin.rdx10 = add <4 x i32> %bin.rdx8, %rdx.shuf9, !dbg !19
+  %20 = extractelement <4 x i32> %bin.rdx10, i32 0, !dbg !19
+  %cmp.n = icmp eq i64 %wide.trip.count, %n.vec, !dbg !13
+  br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph, !dbg !13
+
+scalar.ph:                                        ; preds = %middle.block, %for.body.lr.ph
+  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.lr.ph ]
+  %bc.merge.rdx = phi i32 [ 0, %for.body.lr.ph ], [ %20, %middle.block ]
+  br label %for.body, !dbg !13
+
+for.cond.cleanup.loopexit:                        ; preds = %middle.block, %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ], [ %20, %middle.block ], !dbg !19
+  br label %for.cond.cleanup, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %Ret.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ], !dbg !14
+  ret i32 %Ret.0.lcssa, !dbg !25
+
+for.body:                                         ; preds = %for.body, %scalar.ph
+  %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
+  %Ret.08 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %0, i64 %indvars.iv, !dbg !19
+  %21 = load i32, i32* %arrayidx, align 4, !dbg !19, !tbaa !20
+  %22 = trunc i64 %indvars.iv to i32, !dbg !19
+  %mul = mul i32 %21, %22, !dbg !19
+  %add = add i32 %mul, %Ret.08, !dbg !19
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !13
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !13
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !13, !llvm.loop !26
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 9.0.0 (https://github.com/llvm/llvm-project.git b1e28d9b6a16380ccf1456fe0695f639364407a9)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "inlined.cpp", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 9.0.0 (https://github.com/llvm/llvm-project.git b1e28d9b6a16380ccf1456fe0695f639364407a9)"}
+!8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 13, type: !9, scopeLine: 14, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!9 = !DISubroutineType(types: !2)
+!10 = !DILocation(line: 15, scope: !8)
+!11 = !DILocation(line: 16, scope: !8)
+!12 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 3, type: !9, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!13 = !DILocation(line: 6, scope: !12)
+!14 = !DILocation(line: 0, scope: !12)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"any pointer", !17, i64 0}
+!17 = !{!"omnipotent char", !18, i64 0}
+!18 = !{!"Simple C++ TBAA"}
+!19 = !DILocation(line: 8, scope: !12)
+!20 = !{!21, !21, i64 0}
+!21 = !{!"int", !17, i64 0}
+!22 = distinct !{!22, !13, !23, !24}
+!23 = !DILocation(line: 9, scope: !12)
+!24 = !{!"llvm.loop.isvectorized", i32 1}
+!25 = !DILocation(line: 10, scope: !12)
+!26 = distinct !{!26, !13, !23, !27, !24}
+!27 = !{!"llvm.loop.unroll.runtime.disable"}

diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
index 8363ed461f94..a8b4601cc1c9 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
@@ -99,7 +99,7 @@ define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i3
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -109,17 +109,17 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inre
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -128,27 +128,27 @@ define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inre
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -199,7 +199,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i3
   %elt2 = extractelement <4 x float> %data, i32 2
   %ins0 = insertelement <2 x float> poison, float %elt0, i32 0
   %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 4, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 4, i32 1>
   %ret = fadd <2 x float> %ins1, %shuf
   ret <2 x float> %ret
 }
@@ -270,17 +270,17 @@ define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i3
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
 ; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -325,7 +325,7 @@ define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %r
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -334,7 +334,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i3
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -480,7 +480,7 @@ define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -490,7 +490,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32>
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -500,7 +500,7 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32>
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -509,7 +509,7 @@ define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32>
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
@@ -519,17 +519,17 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -567,7 +567,7 @@ define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -577,7 +577,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32>
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -674,7 +674,7 @@ define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc,
 ; CHECK-NEXT: ret <2 x half>
 define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x half> %shuf
 }
 
@@ -737,7 +737,7 @@ define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i3
 ; CHECK-NEXT: ret <2 x i8>
 define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %shuf
 }
 
@@ -837,7 +837,7 @@ define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc,
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -847,7 +847,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> in
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -857,7 +857,7 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> in
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -866,17 +866,17 @@ define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> in
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -914,7 +914,7 @@ define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc,
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -924,7 +924,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> in
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -932,11 +932,11 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> in
 ; to vec4 anyway during lowering.
 ; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -1032,7 +1032,7 @@ define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i
 ; CHECK-NEXT: ret <2 x half>
 define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x half> %shuf
 }
 
@@ -1095,7 +1095,7 @@ define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32
 ; CHECK-NEXT: ret <2 x i8>
 define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
   %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %shuf
 }
 
@@ -1203,7 +1203,7 @@ define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inre
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -1213,17 +1213,17 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -1232,27 +1232,27 @@ define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -1290,17 +1290,17 @@ define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inre
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32(
 ; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -1438,7 +1438,7 @@ define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %r
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -1448,7 +1448,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i3
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -1458,7 +1458,7 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i3
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -1467,7 +1467,7 @@ define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i3
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
@@ -1477,17 +1477,17 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -1525,7 +1525,7 @@ define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %r
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -1535,7 +1535,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i3
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -1632,7 +1632,7 @@ define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rs
 ; CHECK-NEXT: ret <2 x half>
 define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x half> %shuf
 }
 
@@ -1695,7 +1695,7 @@ define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc,
 ; CHECK-NEXT: ret <2 x i8>
 define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %shuf
 }
 
@@ -1803,7 +1803,7 @@ define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> i
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -1813,17 +1813,17 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -1832,27 +1832,27 @@ define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -1890,17 +1890,17 @@ define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> i
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32(
 ; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -2023,7 +2023,7 @@ define amdgpu_ps float @extract_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsr
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -2033,17 +2033,17 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(<4 x i32>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -2052,27 +2052,27 @@ define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32>
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -2110,17 +2110,17 @@ define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsr
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32(
 ; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -2286,7 +2286,7 @@ define amdgpu_ps float @extract_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -2296,17 +2296,17 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(<4 x i
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -2315,27 +2315,27 @@ define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -2373,17 +2373,17 @@ define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32(
 ; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -2505,7 +2505,7 @@ define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -2515,17 +2515,17 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inr
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -2534,27 +2534,27 @@ define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inr
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32(
 ; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -2592,17 +2592,17 @@ define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i
 ; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
 ; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32(
 ; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -2755,7 +2755,7 @@ define amdgpu_ps float @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(float
 ; CHECK-NEXT: ret <2 x float> %1
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -2764,7 +2764,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -2773,7 +2773,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -2782,7 +2782,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
@@ -2792,7 +2792,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32
 ; CHECK-NEXT: ret <3 x float> %1
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
@@ -2802,7 +2802,7 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
@@ -2812,7 +2812,7 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_
 ; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
@@ -2821,7 +2821,7 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
@@ -2830,7 +2830,7 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
@@ -2924,7 +2924,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, flo
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 13, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
@@ -2939,7 +2939,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float,
 ; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 3>
   ret <2 x float> %shuf
 }
 
@@ -2954,7 +2954,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i
 ; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
@@ -2997,7 +2997,7 @@ define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %d
 ; CHECK-NEXT: ret <4 x half> %res
 define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x half> %res
 }
 
@@ -3007,17 +3007,17 @@ define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32
 ; CHECK-NEXT: ret <4 x half> %res
 define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x half> %res
 }
 
 ; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(
 ; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %res = insertelement <4 x half> undef, half %data, i64 0
+; CHECK-NEXT: %res = insertelement <4 x half> poison, half %data, i64 0
 ; CHECK-NEXT: ret <4 x half> %res
 define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-  %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+  %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
   ret <4 x half> %res
 }
 

diff  --git a/llvm/test/Transforms/InstCombine/X86/shufflemask-undef-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/shufflemask-undef-inseltpoison.ll
new file mode 100644
index 000000000000..95b453a20698
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/X86/shufflemask-undef-inseltpoison.ll
@@ -0,0 +1,110 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK-NOT: shufflevector{{.*}}i32 8"
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9"
+	%struct.ActiveTextureTargets = type { i64, i64, i64, i64, i64, i64 }
+	%struct.AlphaTest = type { float, i16, i8, i8 }
+	%struct.ArrayRange = type { i8, i8, i8, i8 }
+	%struct.BlendMode = type { i16, i16, i16, i16, %struct.IColor4, i16, i16, i8, i8, i8, i8 }
+	%struct.ClearColor = type { double, %struct.IColor4, %struct.IColor4, float, i32 }
+	%struct.ClipPlane = type { i32, [6 x %struct.IColor4] }
+	%struct.ColorBuffer = type { i16, i8, i8, [8 x i16], [0 x i32] }
+	%struct.ColorMatrix = type { [16 x float]*, %struct.ImagingColorScale }
+	%struct.Convolution = type { %struct.IColor4, %struct.ImagingColorScale, i16, i16, [0 x i32], float*, i32, i32 }
+	%struct.DepthTest = type { i16, i16, i8, i8, i8, i8, double, double }
+	%struct.FixedFunction = type { %struct.PPStreamToken* }
+	%struct.FogMode = type { %struct.IColor4, float, float, float, float, float, i16, i16, i16, i8, i8 }
+	%struct.HintMode = type { i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 }
+	%struct.Histogram = type { %struct.ProgramLimits*, i32, i16, i8, i8 }
+	%struct.ImagingColorScale = type { %struct.TCoord2, %struct.TCoord2, %struct.TCoord2, %struct.TCoord2 }
+	%struct.ImagingSubset = type { %struct.Convolution, %struct.Convolution, %struct.Convolution, %struct.ColorMatrix, %struct.Minmax, %struct.Histogram, %struct.ImagingColorScale, %struct.ImagingColorScale, %struct.ImagingColorScale, %struct.ImagingColorScale, i32, [0 x i32] }
+	%struct.Light = type { %struct.IColor4, %struct.IColor4, %struct.IColor4, %struct.IColor4, %struct.PointLineLimits, float, float, float, float, float, %struct.PointLineLimits, float, %struct.PointLineLimits, float, %struct.PointLineLimits, float, float, float, float, float }
+	%struct.LightModel = type { %struct.IColor4, [8 x %struct.Light], [2 x %struct.Material], i32, i16, i16, i16, i8, i8, i8, i8, i8, i8 }
+	%struct.LightProduct = type { %struct.IColor4, %struct.IColor4, %struct.IColor4 }
+	%struct.LineMode = type { float, i32, i16, i16, i8, i8, i8, i8 }
+	%struct.LogicOp = type { i16, i8, i8 }
+	%struct.MaskMode = type { i32, [3 x i32], i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.Material = type { %struct.IColor4, %struct.IColor4, %struct.IColor4, %struct.IColor4, float, float, float, float, [8 x %struct.LightProduct], %struct.IColor4, [8 x i32] }
+	%struct.Minmax = type { %struct.MinmaxTable*, i16, i8, i8, [0 x i32] }
+	%struct.MinmaxTable = type { %struct.IColor4, %struct.IColor4 }
+	%struct.Mipmaplevel = type { [4 x i32], [4 x i32], [4 x float], [4 x i32], i32, i32, float*, i8*, i16, i16, i16, i16, [2 x float] }
+	%struct.Multisample = type { float, i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.PipelineProgramState = type { i8, i8, i8, i8, [0 x i32], %struct.IColor4* }
+	%struct.PixelMap = type { i32*, float*, float*, float*, float*, float*, float*, float*, float*, i32*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+	%struct.PixelMode = type { float, float, %struct.PixelStore, %struct.PixelTransfer, %struct.PixelMap, %struct.ImagingSubset, i32, i32 }
+	%struct.PixelPack = type { i32, i32, i32, i32, i32, i32, i32, i32, i8, i8, i8, i8 }
+	%struct.PixelStore = type { %struct.PixelPack, %struct.PixelPack }
+	%struct.PixelTransfer = type { float, float, float, float, float, float, float, float, float, float, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float }
+	%struct.PluginBufferData = type { i32 }
+	%struct.PointLineLimits = type { float, float, float }
+	%struct.PointMode = type { float, float, float, float, %struct.PointLineLimits, float, i8, i8, i8, i8, i16, i16, i32, i16, i16 }
+	%struct.PolygonMode = type { [128 x i8], float, float, i16, i16, i16, i16, i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.ProgramLimits = type { i32, i32, i32, i32 }
+	%struct.RegisterCombiners = type { i8, i8, i8, i8, i32, [2 x %struct.IColor4], [8 x %struct.RegisterCombinersPerStageState], %struct.RegisterCombinersFinalStageState }
+	%struct.RegisterCombinersFinalStageState = type { i8, i8, i8, i8, [7 x %struct.RegisterCombinersPerVariableState] }
+	%struct.RegisterCombinersPerPortionState = type { [4 x %struct.RegisterCombinersPerVariableState], i8, i8, i8, i8, i16, i16, i16, i16, i16, i16 }
+	%struct.RegisterCombinersPerStageState = type { [2 x %struct.RegisterCombinersPerPortionState], [2 x %struct.IColor4] }
+	%struct.RegisterCombinersPerVariableState = type { i16, i16, i16, i16 }
+	%struct.SWRSurfaceRec = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8*, i8*, i8*, [4 x i8*], i32 }
+	%struct.ScissorTest = type { %struct.ProgramLimits, i8, i8, i8, i8 }
+	%struct.State = type <{ i16, i16, i16, i16, i32, i32, [256 x %struct.IColor4], [128 x %struct.IColor4], %struct.Viewport, %struct.Transform, %struct.LightModel, %struct.ActiveTextureTargets, %struct.AlphaTest, %struct.BlendMode, %struct.ClearColor, %struct.ColorBuffer, %struct.DepthTest, %struct.ArrayRange, %struct.FogMode, %struct.HintMode, %struct.LineMode, %struct.LogicOp, %struct.MaskMode, %struct.PixelMode, %struct.PointMode, %struct.PolygonMode, %struct.ScissorTest, i32, %struct.StencilTest, [8 x %struct.TextureMode], [16 x %struct.TextureImageMode], %struct.ArrayRange, [8 x %struct.TextureCoordGen], %struct.ClipPlane, %struct.Multisample, %struct.RegisterCombiners, %struct.ArrayRange, %struct.ArrayRange, [3 x %struct.PipelineProgramState], %struct.ArrayRange, %struct.TransformFeedback, i32*, %struct.FixedFunction, [3 x i32], [3 x i32] }>
+	%struct.StencilTest = type { [3 x { i32, i32, i16, i16, i16, i16 }], i32, [4 x i8] }
+	%struct.TextureCoordGen = type { { i16, i16, %struct.IColor4, %struct.IColor4 }, { i16, i16, %struct.IColor4, %struct.IColor4 }, { i16, i16, %struct.IColor4, %struct.IColor4 }, { i16, i16, %struct.IColor4, %struct.IColor4 }, i8, i8, i8, i8 }
+	%struct.TextureGeomState = type { i16, i16, i16, i16, i16, i8, i8, i8, i8, i16, i16, i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, [6 x i16], [6 x i16] }
+	%struct.TextureImageMode = type { float }
+	%struct.TextureLevel = type { i32, i32, i16, i16, i16, i8, i8, i16, i16, i16, i16, i8* }
+	%struct.TextureMode = type { %struct.IColor4, i32, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, float, float, i16, i16, i16, i16, i16, i16, [4 x i16], i8, i8, i8, i8, [3 x float], [4 x float], float, float }
+	%struct.TextureParamState = type { i16, i16, i16, i16, i16, i16, %struct.IColor4, float, float, float, float, i16, i16, i16, i16, float, i16, i8, i8, i32, i8* }
+	%struct.TextureRec = type { [4 x float], %struct.TextureState*, %struct.Mipmaplevel*, %struct.Mipmaplevel*, float, float, float, float, i8, i8, i8, i8, i16, i16, i16, i16, i32, float, [2 x %struct.PPStreamToken] }
+	%struct.TextureState = type { i16, i8, i8, i16, i16, float, i32, %struct.SWRSurfaceRec*, %struct.TextureParamState, %struct.TextureGeomState, [0 x i32], i8*, i32, %struct.TextureLevel, [1 x [15 x %struct.TextureLevel]] }
+	%struct.Transform = type <{ [24 x [16 x float]], [24 x [16 x float]], [16 x float], float, float, float, float, float, i8, i8, i8, i8, i32, i32, i32, i16, i16, i8, i8, i8, i8, i32 }>
+	%struct.TransformFeedback = type { i8, i8, i8, i8, [0 x i32], [16 x i32], [16 x i32] }
+	%struct.Viewport = type { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, double, double, i32, i32, i32, i32, float, float, float, float }
+	%struct.IColor4 = type { float, float, float, float }
+	%struct.TCoord2 = type { float, float }
+	%struct.VMGPStack = type { [6 x <4 x float>*], <4 x float>*, i32, i32, <4 x float>*, <4 x float>**, i32, i32, i32, i32, i32, i32 }
+	%struct.VMTextures = type { [16 x %struct.TextureRec*] }
+	%struct.PPStreamToken = type { { i16, i16, i32 } }
+	%struct._VMConstants = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, float, float, float, float, float, float, float, float, float, float, float, float, [256 x float], [528 x i8], { void (i8*, i8*, i32, i8*)*, float (float)*, float (float)*, float (float)*, i32 (float)* } }
+
+define i32 @foo(%struct.State* %dst, <4 x float>* %prgrm, <4 x float>** %buffs, %struct._VMConstants* %cnstn, %struct.PPStreamToken* %pstrm, %struct.PluginBufferData* %gpctx, %struct.VMTextures* %txtrs, %struct.VMGPStack* %gpstk) nounwind {
+bb266.i:
+	getelementptr <4 x float>, <4 x float>* null, i32 11		; <<4 x float>*>:0 [#uses=1]
+	load <4 x float>, <4 x float>* %0, align 16		; <<4 x float>>:1 [#uses=1]
+	shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> < i32 0, i32 1, i32 1, i32 1 >		; <<4 x float>>:2 [#uses=1]
+	shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>>:3 [#uses=1]
+	shufflevector <4 x float> undef, <4 x float> poison, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>>:4 [#uses=1]
+	shufflevector <4 x float> %4, <4 x float> %3, <4 x i32> < i32 6, i32 7, i32 2, i32 3 >		; <<4 x float>>:5 [#uses=1]
+	fmul <4 x float> %5, zeroinitializer		; <<4 x float>>:6 [#uses=2]
+	fmul <4 x float> %6, %6		; <<4 x float>>:7 [#uses=1]
+	fadd <4 x float> zeroinitializer, %7		; <<4 x float>>:8 [#uses=1]
+	call <4 x float> @llvm.x86.sse.max.ps( <4 x float> zeroinitializer, <4 x float> %8 ) nounwind readnone		; <<4 x float>>:9 [#uses=1]
+	%phitmp40 = bitcast <4 x float> %9 to <4 x i32>		; <<4 x i32>> [#uses=1]
+	%tmp4109.i = and <4 x i32> %phitmp40, < i32 8388607, i32 8388607, i32 8388607, i32 8388607 >		; <<4 x i32>> [#uses=1]
+	%tmp4116.i = or <4 x i32> %tmp4109.i, < i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216 >		; <<4 x i32>> [#uses=1]
+	%tmp4117.i = bitcast <4 x i32> %tmp4116.i to <4 x float>		; <<4 x float>> [#uses=1]
+	fadd <4 x float> %tmp4117.i, zeroinitializer		; <<4 x float>>:10 [#uses=1]
+	fmul <4 x float> %10, < float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01 >		; <<4 x float>>:11 [#uses=1]
+	call <4 x float> @llvm.x86.sse.max.ps( <4 x float> %11, <4 x float> zeroinitializer ) nounwind readnone		; <<4 x float>>:12 [#uses=1]
+	call <4 x float> @llvm.x86.sse.min.ps( <4 x float> %12, <4 x float> zeroinitializer ) nounwind readnone		; <<4 x float>>:13 [#uses=1]
+	%tmp4170.i = call <4 x float> @llvm.x86.sse.cmp.ps( <4 x float> %13, <4 x float> zeroinitializer, i8 2 ) nounwind		; <<4 x float>> [#uses=1]
+	bitcast <4 x float> %tmp4170.i to <16 x i8>		; <<16 x i8>>:14 [#uses=1]
+	call i32 @llvm.x86.sse2.pmovmskb.128( <16 x i8> %14 ) nounwind readnone		; <i32>:15 [#uses=1]
+	icmp eq i32 %15, 0		; <i1>:16 [#uses=1]
+	br i1 %16, label %bb5574.i, label %bb4521.i
+
+bb4521.i:		; preds = %bb266.i
+	unreachable
+
+bb5574.i:		; preds = %bb266.i
+	unreachable
+}
+
+declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
+
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

diff  --git a/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll
index a4cf1351a787..281c69ab2d9d 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll
@@ -13,13 +13,13 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8 immarg
 
 define double @elts_addsub_v2f64(<2 x double> %0, <2 x double> %1) {
 ; CHECK-LABEL: @elts_addsub_v2f64(
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP0:%.*]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
 ; CHECK-NEXT:    ret double [[TMP5]]
 ;
-  %3 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
-  %4 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %3 = shufflevector <2 x double> %0, <2 x double> poison, <2 x i32> <i32 0, i32 0>
+  %4 = shufflevector <2 x double> %1, <2 x double> poison, <2 x i32> <i32 1, i32 1>
   %5 = tail call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %3, <2 x double> %4)
   %6 = extractelement <2 x double> %5, i32 0
   ret double %6
@@ -31,8 +31,8 @@ define double @elts_addsub_v2f64_sub(<2 x double> %0, <2 x double> %1) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
 ; CHECK-NEXT:    ret double [[TMP4]]
 ;
-  %3 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
-  %4 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %3 = shufflevector <2 x double> %0, <2 x double> poison, <2 x i32> <i32 0, i32 0>
+  %4 = shufflevector <2 x double> %1, <2 x double> poison, <2 x i32> <i32 0, i32 0>
   %5 = tail call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %3, <2 x double> %4)
   %6 = extractelement <2 x double> %5, i32 0
   ret double %6
@@ -46,8 +46,8 @@ define float @elts_addsub_v4f32(<4 x float> %0, <4 x float> %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd float [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret float [[TMP6]]
 ;
-  %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-  %4 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %3 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %4 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %5 = tail call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %3, <4 x float> %4)
   %6 = extractelement <4 x float> %5, i32 0
   %7 = extractelement <4 x float> %5, i32 1
@@ -63,8 +63,8 @@ define float @elts_addsub_v4f32_add(<4 x float> %0, <4 x float> %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd float [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret float [[TMP6]]
 ;
-  %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-  %4 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %3 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %4 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %5 = tail call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %3, <4 x float> %4)
   %6 = extractelement <4 x float> %5, i32 1
   %7 = extractelement <4 x float> %5, i32 3
@@ -80,8 +80,8 @@ define double @elts_addsub_v4f64(<4 x double> %0, <4 x double> %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd double [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret double [[TMP6]]
 ;
-  %3 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
-  %4 = shufflevector <4 x double> %1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %3 = shufflevector <4 x double> %0, <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %4 = shufflevector <4 x double> %1, <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   %5 = tail call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %3, <4 x double> %4)
   %6 = extractelement <4 x double> %5, i32 0
   %7 = extractelement <4 x double> %5, i32 1
@@ -97,8 +97,8 @@ define double @elts_addsub_v4f64_add(<4 x double> %0, <4 x double> %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd double [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret double [[TMP6]]
 ;
-  %3 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
-  %4 = shufflevector <4 x double> %1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %3 = shufflevector <4 x double> %0, <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %4 = shufflevector <4 x double> %1, <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   %5 = tail call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %3, <4 x double> %4)
   %6 = extractelement <4 x double> %5, i32 1
   %7 = extractelement <4 x double> %5, i32 3
@@ -114,8 +114,8 @@ define float @elts_addsub_v8f32(<8 x float> %0, <8 x float> %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd float [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret float [[TMP6]]
 ;
-  %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
-  %4 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
+  %3 = shufflevector <8 x float> %0, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
+  %4 = shufflevector <8 x float> %1, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
   %5 = tail call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %3, <8 x float> %4)
   %6 = extractelement <8 x float> %5, i32 0
   %7 = extractelement <8 x float> %5, i32 1
@@ -131,8 +131,8 @@ define float @elts_addsub_v8f32_sub(<8 x float> %0, <8 x float> %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd float [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret float [[TMP6]]
 ;
-  %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
-  %4 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
+  %3 = shufflevector <8 x float> %0, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
+  %4 = shufflevector <8 x float> %1, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
   %5 = tail call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %3, <8 x float> %4)
   %6 = extractelement <8 x float> %5, i32 0
   %7 = extractelement <8 x float> %5, i32 4
@@ -181,13 +181,13 @@ define double @PR48476_fsub(<2 x double> %x) {
 define double @PR48476_fadd_fsub(<2 x double> %x) {
 ; CHECK-LABEL: @PR48476_fadd_fsub(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[X:%.*]], <double poison, double 0.000000e+00>
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
 ; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[S]], [[X]]
 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret double [[VECEXT]]
 ;
   %t1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> zeroinitializer, <2 x double> %x)
-  %s = shufflevector <2 x double> %t1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  %s = shufflevector <2 x double> %t1, <2 x double> poison, <2 x i32> <i32 1, i32 0>
   %t2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %s, <2 x double> %x)
   %vecext = extractelement <2 x double> %t2, i32 0
   ret double %vecext

diff  --git a/llvm/test/Transforms/InstCombine/X86/x86-avx2-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx2-inseltpoison.ll
new file mode 100644
index 000000000000..2ffe84670e38
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx2-inseltpoison.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Verify that instcombine is able to fold identity shuffles.
+
+define <8 x i32> @identity_test_vpermd(<8 x i32> %a0) {
+; CHECK-LABEL: @identity_test_vpermd(
+; CHECK-NEXT:    ret <8 x i32> [[A0:%.*]]
+;
+  %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+  ret <8 x i32> %a
+}
+
+define <8 x float> @identity_test_vpermps(<8 x float> %a0) {
+; CHECK-LABEL: @identity_test_vpermps(
+; CHECK-NEXT:    ret <8 x float> [[A0:%.*]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+  ret <8 x float> %a
+}
+
+; Instcombine should be able to fold the following shuffle to a builtin shufflevector
+; with a shuffle mask of all zeroes.
+
+define <8 x i32> @zero_test_vpermd(<8 x i32> %a0) {
+; CHECK-LABEL: @zero_test_vpermd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer)
+  ret <8 x i32> %a
+}
+
+define <8 x float> @zero_test_vpermps(<8 x float> %a0) {
+; CHECK-LABEL: @zero_test_vpermps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer)
+  ret <8 x float> %a
+}
+
+; Verify that instcombine is able to fold constant shuffles.
+
+define <8 x i32> @shuffle_test_vpermd(<8 x i32> %a0) {
+; CHECK-LABEL: @shuffle_test_vpermd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x i32> %a
+}
+
+define <8 x float> @shuffle_test_vpermps(<8 x float> %a0) {
+; CHECK-LABEL: @shuffle_test_vpermps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x float> %a
+}
+
+; Verify that instcombine is able to fold constant shuffles with undef mask elements.
+
+define <8 x i32> @undef_test_vpermd(<8 x i32> %a0) {
+; CHECK-LABEL: @undef_test_vpermd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x i32> %a
+}
+
+define <8 x float> @undef_test_vpermps(<8 x float> %a0) {
+; CHECK-LABEL: @undef_test_vpermps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x float> %a
+}
+
+; Verify simplify demanded elts.
+
+define <8 x i32> @elts_test_vpermd(<8 x i32> %a0, i32 %a1) {
+; CHECK-LABEL: @elts_test_vpermd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = insertelement <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i32 %a1, i32 0
+  %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %1)
+  %3 = shufflevector <8 x i32> %2, <8 x i32> poison, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %3
+}
+
+define <8 x float> @elts_test_vpermps(<8 x float> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @elts_test_vpermps(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = insertelement <8 x i32> %a1, i32 0, i32 7
+  %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %1)
+  %3 = shufflevector <8 x float> %2, <8 x float> poison, <8 x i32> zeroinitializer
+  ret <8 x float> %3
+}
+
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)

diff  --git a/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
new file mode 100644
index 000000000000..bc0b67921784
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>)
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>)
+
+;
+; Vector Demanded Elts
+;
+
+; Only bottom 4 elements required.
+define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) {
+; CHECK-LABEL: @demand_vcvtph2ps_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <4 x half>
+; CHECK-NEXT:    [[CVTPH2PS:%.*]] = fpext <4 x half> [[TMP2]] to <4 x float>
+; CHECK-NEXT:    ret <4 x float> [[CVTPH2PS]]
+;
+  %1 = shufflevector <8 x i16> %A, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1)
+  ret <4 x float> %2
+}
+
+; All 8 elements required.
+define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
+; CHECK-LABEL: @demand_vcvtph2ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <8 x half>
+; CHECK-NEXT:    [[CVTPH2PS:%.*]] = fpext <8 x half> [[TMP2]] to <8 x float>
+; CHECK-NEXT:    ret <8 x float> [[CVTPH2PS]]
+;
+  %1 = shufflevector <8 x i16> %A, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
+  ret <8 x float> %2
+}
+
+;
+; Constant Folding
+;
+
+define <4 x float> @fold_vcvtph2ps_128() {
+; CHECK-LABEL: @fold_vcvtph2ps_128(
+; CHECK-NEXT:    ret <4 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float -0.000000e+00>
+;
+  %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 14336, i16 15360, i16 32768, i16 16384, i16 31743, i16 48128, i16 49152>)
+  ret <4 x float> %1
+}
+
+define <8 x float> @fold_vcvtph2ps_256() {
+; CHECK-LABEL: @fold_vcvtph2ps_256(
+; CHECK-NEXT:    ret <8 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float -0.000000e+00, float 2.000000e+00, float 6.550400e+04, float -1.000000e+00, float -2.000000e+00>
+;
+  %1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 14336, i16 15360, i16 32768, i16 16384, i16 31743, i16 48128, i16 49152>)
+  ret <8 x float> %1
+}
+
+define <4 x float> @fold_vcvtph2ps_128_zero() {
+; CHECK-LABEL: @fold_vcvtph2ps_128_zero(
+; CHECK-NEXT:    ret <4 x float> zeroinitializer
+;
+  %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  ret <4 x float> %1
+}
+
+define <8 x float> @fold_vcvtph2ps_256_zero() {
+; CHECK-LABEL: @fold_vcvtph2ps_256_zero(
+; CHECK-NEXT:    ret <8 x float> zeroinitializer
+;
+  %1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  ret <8 x float> %1
+}

diff  --git a/llvm/test/Transforms/InstCombine/X86/x86-muldq-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-muldq-inseltpoison.ll
new file mode 100644
index 000000000000..d7a2c82f5292
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/X86/x86-muldq-inseltpoison.ll
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+;
+; UNDEF Elts
+;
+
+define <2 x i64> @undef_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuludq_128(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> undef)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @undef_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuludq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> undef, <8 x i32> undef)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @undef_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuludq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> undef)
+  ret <8 x i64> %1
+}
+
+define <2 x i64> @undef_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuldq_128(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> undef, <4 x i32> undef)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @undef_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuldq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> undef)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @undef_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuldq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> undef, <16 x i32> undef)
+  ret <8 x i64> %1
+}
+
+define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuludq_128(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> zeroinitializer)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuludq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> undef)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuludq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> zeroinitializer)
+  ret <8 x i64> %1
+}
+
+define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuldq_128(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> zeroinitializer, <4 x i32> undef)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuldq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> zeroinitializer)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuldq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> undef)
+  ret <8 x i64> %1
+}
+
+;
+; Constant Folding
+;
+
+define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuludq_128(
+; CHECK-NEXT:    ret <2 x i64> <i64 9223372030412324865, i64 4294967295>
+;
+  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2147483647, i32 1, i32 1, i32 3>)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuludq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuludq_512(
+; CHECK-NEXT:    ret <8 x i64> <i64 0, i64 0, i64 255, i64 131070, i64 0, i64 -281474976645121, i64 140737488289792, i64 281470681743360>
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 0, i32 undef, i32 0, i32 1, i32 1, i32 2, i32 2, i32 undef, i32 undef, i32 -1, i32 -1, i32 65536, i32 -1, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 255, i32 -256, i32 65535, i32 -65536, i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
+  ret <8 x i64> %1
+}
+
+define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuldq_128(
+; CHECK-NEXT:    ret <2 x i64> <i64 0, i64 2>
+;
+  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 undef, i32 1, i32 -2, i32 3>)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuldq_256(
+; CHECK-NEXT:    ret <4 x i64> <i64 0, i64 4294836225, i64 140737488289792, i64 -140737488355328>
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>, <8 x i32> <i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @fold_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuldq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> <i32 undef, i32 -1, i32 -3, i32 -1, i32 8, i32 10, i32 -256, i32 65536, i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>)
+  ret <8 x i64> %1
+}
+
+;
+; PMULUDQ/PMULDQ - only the even elements (0, 2, 4, 6) of the vXi32 inputs are required.
+;
+
+define <2 x i64> @test_demanded_elts_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuludq_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP3]], <i64 4294967295, i64 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = and <2 x i64> [[TMP4]], <i64 4294967295, i64 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <2 x i64> [[TMP8]]
+;
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %2 = shufflevector <4 x i32> %a1, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %3 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %1, <4 x i32> %2)
+  %4 = shufflevector <2 x i64> %3, <2 x i64> poison, <2 x i32> zeroinitializer
+  ret <2 x i64> %4
+}
+
+define <4 x i64> @test_demanded_elts_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuludq_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[TMP3]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[TMP4]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  %3 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %1, <8 x i32> %2)
+  ret <4 x i64> %3
+}
+
+define <8 x i64> @test_demanded_elts_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuludq_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i64> [[TMP3]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i64> [[TMP4]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <8 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> poison, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  %3 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %1, <16 x i32> %2)
+  ret <8 x i64> %3
+}
+
+define <2 x i64> @test_demanded_elts_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuldq_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <2 x i64> [[TMP3]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <2 x i64> [[TMP5]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP4]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <2 x i64> [[TMP7]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <2 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
+;
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %2 = shufflevector <4 x i32> %a1, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %3 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %1, <4 x i32> %2)
+  ret <2 x i64> %3
+}
+
+define <4 x i64> @test_demanded_elts_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuldq_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i64> [[TMP3]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <4 x i64> [[TMP4]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <4 x i64> [[TMP7]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
+; CHECK-NEXT:    ret <4 x i64> [[TMP10]]
+;
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  %3 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %1, <8 x i32> %2)
+  %4 = shufflevector <4 x i64> %3, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
+  ret <4 x i64> %4
+}
+
+define <8 x i64> @test_demanded_elts_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuldq_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <8 x i64> [[TMP3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <8 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <8 x i64> [[TMP4]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <8 x i64> [[TMP7]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
+; CHECK-NEXT:    ret <8 x i64> [[TMP10]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> poison, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  %3 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %1, <16 x i32> %2)
+  %4 = shufflevector <8 x i64> %3, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
+  ret <8 x i64> %4
+}
+
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
+declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) nounwind readnone
+declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>) nounwind readnone

diff  --git a/llvm/test/Transforms/InstCombine/X86/x86-pack-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-pack-inseltpoison.ll
index 5f3b991be512..ff8842cd15ea 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-pack-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-pack-inseltpoison.ll
@@ -208,26 +208,26 @@ define <64 x i8> @fold_packuswb_512() {
 define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[A0:%.*]], <4 x i32> undef)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 ;
-  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
-  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
+  %2 = shufflevector <4 x i32> %a1, <4 x i32> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
   %3 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %2)
-  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 7, i32 7, i32 7, i32 7>
+  %4 = shufflevector <8 x i16> %3, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 7, i32 7, i32 7, i32 7>
   ret <8 x i16> %4
 }
 
 define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
 ; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 ;
   %1 = insertelement <4 x i32> %a0, i32 0, i32 0
   %2 = insertelement <4 x i32> %a1, i32 0, i32 3
   %3 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %1, <4 x i32> %2)
-  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
+  %4 = shufflevector <8 x i16> %3, <8 x i16> poison, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
   ret <8 x i16> %4
 }
 
@@ -238,7 +238,7 @@ define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
   %1 = insertelement <8 x i16> %a0, i16 0, i32 0
   %2 = insertelement <8 x i16> %a1, i16 0, i32 0
   %3 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
-  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  %4 = shufflevector <16 x i8> %3, <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %4
 }
 
@@ -249,34 +249,34 @@ define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
   %1 = insertelement <8 x i16> poison, i16 0, i32 0
   %2 = insertelement <8 x i16> poison, i16 0, i32 0
   %3 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
-  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  %4 = shufflevector <16 x i8> %3, <16 x i8> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
   ret <16 x i8> %4
 }
 
 define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[A0:%.*]], <8 x i32> undef)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> poison, <16 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <16 x i16> [[TMP2]]
 ;
-  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef>
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> poison, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef>
   %3 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %2)
-  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15>
+  %4 = shufflevector <16 x i16> %3, <16 x i16> poison, <16 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15>
   ret <16 x i16> %4
 }
 
 define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> poison, <8 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
 ;
-  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   %3 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %2)
-  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
+  %4 = shufflevector <16 x i16> %3, <16 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <16 x i16> %4
 }
 
@@ -287,7 +287,7 @@ define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
   %1 = insertelement <16 x i16> %a0, i16 0, i32 0
   %2 = insertelement <16 x i16> %a1, i16 0, i32 8
   %3 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
-  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  %4 = shufflevector <32 x i8> %3, <32 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
   ret <32 x i8> %4
 }
 
@@ -298,34 +298,34 @@ define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
   %1 = insertelement <16 x i16> poison, i16 0, i32 1
   %2 = insertelement <16 x i16> poison, i16 0, i32 0
   %3 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
-  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> zeroinitializer
+  %4 = shufflevector <32 x i8> %3, <32 x i8> poison, <32 x i32> zeroinitializer
   ret <32 x i8> %4
 }
 
 define <32 x i16> @elts_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A0:%.*]], <16 x i32> undef)
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> undef, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 undef, i32 undef, i32 27, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> poison, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 undef, i32 undef, i32 27, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <32 x i16> [[TMP2]]
 ;
-  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef, i32 undef, i32 10, i32 9, i32 undef, i32 undef, i32 14, i32 13, i32 undef>
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> poison, <16 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef, i32 undef, i32 10, i32 9, i32 undef, i32 undef, i32 14, i32 13, i32 undef>
   %3 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %1, <16 x i32> %2)
-  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 18, i32 19, i32 20, i32 undef, i32 undef, i32 23, i32 24, i32 undef, i32 undef, i32 27, i32 28, i32 undef, i32 undef, i32 31>
+  %4 = shufflevector <32 x i16> %3, <32 x i16> poison, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 18, i32 19, i32 20, i32 undef, i32 undef, i32 23, i32 24, i32 undef, i32 undef, i32 27, i32 28, i32 undef, i32 undef, i32 31>
   ret <32 x i16> %4
 }
 
 define <32 x i16> @elts_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_512(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> poison, <16 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
 ;
-  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
   %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %1, <16 x i32> %2)
-  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
+  %4 = shufflevector <32 x i16> %3, <32 x i16> poison, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <32 x i16> %4
 }
 
@@ -338,7 +338,7 @@ define <64 x i8> @elts_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) {
   %3 = insertelement <32 x i16> %1, i16 0, i32 16
   %4 = insertelement <32 x i16> %2, i16 0, i32 24
   %5 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %3, <32 x i16> %4)
-  %6 = shufflevector <64 x i8> %5, <64 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
+  %6 = shufflevector <64 x i8> %5, <64 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
   ret <64 x i8> %6
 }
 
@@ -349,7 +349,7 @@ define <64 x i8> @elts_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) {
   %1 = insertelement <32 x i16> poison, i16 0, i32 1
   %2 = insertelement <32 x i16> poison, i16 0, i32 0
   %3 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %1, <32 x i16> %2)
-  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> zeroinitializer
+  %4 = shufflevector <64 x i8> %3, <64 x i8> poison, <64 x i32> zeroinitializer
   ret <64 x i8> %4
 }
 

diff  --git a/llvm/test/Transforms/InstCombine/X86/x86-pshufb-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-pshufb-inseltpoison.ll
new file mode 100644
index 000000000000..f5094f85a8a4
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/X86/x86-pshufb-inseltpoison.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+
+; Verify that instcombine is able to fold identity shuffles.
+
+define <16 x i8> @identity_test(<16 x i8> %InVec) {
+; CHECK-LABEL: @identity_test(
+; CHECK-NEXT:    ret <16 x i8> [[INVEC:%.*]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @identity_test_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx2(
+; CHECK-NEXT:    ret <32 x i8> [[INVEC:%.*]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @identity_test_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx512(
+; CHECK-NEXT:    ret <64 x i8> [[INVEC:%.*]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+; Verify that instcombine is able to fold byte shuffles with zero masks.
+
+define <16 x i8> @fold_to_zero_vector(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_to_zero_vector_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx2(
+; CHECK-NEXT:    ret <32 x i8> zeroinitializer
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @fold_to_zero_vector_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx512(
+; CHECK-NEXT:    ret <64 x i8> zeroinitializer
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <64 x i8> %1
+}
+
+; Instcombine should be able to fold the following byte shuffle to a builtin shufflevector
+; with a shuffle mask of all zeroes.
+
+define <16 x i8> @splat_test(<16 x i8> %InVec) {
+; CHECK-LABEL: @splat_test(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> zeroinitializer)
+  ret <16 x i8> %1
+}
+
+; In the test case below, elements in the low 128-bit lane of the result
+; vector are equal to the lower byte of %InVec (shuffle index 0).
+; Elements in the high 128-bit lane of the result vector are equal to
+; the lower byte in the high 128-bit lane of %InVec (shuffle index 16).
+
+define <32 x i8> @splat_test_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @splat_test_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> zeroinitializer)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @splat_test_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @splat_test_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> zeroinitializer)
+  ret <64 x i8> %1
+}
+
+; Each of the byte shuffles in the following tests is equivalent to a blend between
+; vector %InVec and a vector of all zeroes.
+
+define <16 x i8> @blend1(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend1(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend2(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 16, i32 16, i32 10, i32 11, i32 16, i32 16, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend3(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 16, i32 16, i32 16, i32 16, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend4(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend4(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend5(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend5(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend6(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend6(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @blend1_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend1_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 32, i32 1, i32 32, i32 3, i32 32, i32 5, i32 32, i32 7, i32 32, i32 9, i32 32, i32 11, i32 32, i32 13, i32 32, i32 15, i32 48, i32 17, i32 48, i32 19, i32 48, i32 21, i32 48, i32 23, i32 48, i32 25, i32 48, i32 27, i32 48, i32 29, i32 48, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15, i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend2_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend2_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 32, i32 32, i32 2, i32 3, i32 32, i32 32, i32 6, i32 7, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 14, i32 15, i32 48, i32 48, i32 18, i32 19, i32 48, i32 48, i32 22, i32 23, i32 48, i32 48, i32 26, i32 27, i32 48, i32 48, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15, i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend3_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend3_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 20, i32 21, i32 22, i32 23, i32 48, i32 48, i32 48, i32 48, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend4_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend4_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend5_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend5_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend6_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend6_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @blend1_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend1_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 64, i32 1, i32 64, i32 3, i32 64, i32 5, i32 64, i32 7, i32 64, i32 9, i32 64, i32 11, i32 64, i32 13, i32 64, i32 15, i32 80, i32 17, i32 80, i32 19, i32 80, i32 21, i32 80, i32 23, i32 80, i32 25, i32 80, i32 27, i32 80, i32 29, i32 80, i32 31, i32 96, i32 33, i32 96, i32 35, i32 96, i32 37, i32 96, i32 39, i32 96, i32 41, i32 96, i32 43, i32 96, i32 45, i32 96, i32 47, i32 112, i32 49, i32 112, i32 51, i32 112, i32 53, i32 112, i32 55, i32 112, i32 57, i32 112, i32 59, i32 112, i32 61, i32 112, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15, i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15, i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15, i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend2_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend2_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 64, i32 64, i32 2, i32 3, i32 64, i32 64, i32 6, i32 7, i32 64, i32 64, i32 10, i32 11, i32 64, i32 64, i32 14, i32 15, i32 80, i32 80, i32 18, i32 19, i32 80, i32 80, i32 22, i32 23, i32 80, i32 80, i32 26, i32 27, i32 80, i32 80, i32 30, i32 31, i32 96, i32 96, i32 34, i32 35, i32 96, i32 96, i32 38, i32 39, i32 96, i32 96, i32 42, i32 43, i32 96, i32 96, i32 46, i32 47, i32 112, i32 112, i32 50, i32 51, i32 112, i32 112, i32 54, i32 55, i32 112, i32 112, i32 58, i32 59, i32 112, i32 112, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15, i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15, i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15, i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend3_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend3_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 64, i32 64, i32 64, i32 64, i32 4, i32 5, i32 6, i32 7, i32 64, i32 64, i32 64, i32 64, i32 12, i32 13, i32 14, i32 15, i32 80, i32 80, i32 80, i32 80, i32 20, i32 21, i32 22, i32 23, i32 80, i32 80, i32 80, i32 80, i32 28, i32 29, i32 30, i32 31, i32 96, i32 96, i32 96, i32 96, i32 36, i32 37, i32 38, i32 39, i32 96, i32 96, i32 96, i32 96, i32 44, i32 45, i32 46, i32 47, i32 112, i32 112, i32 112, i32 112, i32 52, i32 53, i32 54, i32 55, i32 112, i32 112, i32 112, i32 112, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend4_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend4_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend5_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend5_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 16, i32 17, i32 18, i32 19, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 32, i32 33, i32 34, i32 35, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 48, i32 49, i32 50, i32 51, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend6_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend6_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 1, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 16, i32 17, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 32, i32 33, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 48, i32 49, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <64 x i8> %1
+}
+
+; movq idiom.
+define <16 x i8> @movq_idiom(<16 x i8> %InVec) {
+; CHECK-LABEL: @movq_idiom(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @movq_idiom_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @movq_idiom_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @movq_idiom_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @movq_idiom_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <64 x i8> %1
+}
+
+; Vector permutations using byte shuffles.
+
+define <16 x i8> @permute1(<16 x i8> %InVec) {
+; CHECK-LABEL: @permute1(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @permute2(<16 x i8> %InVec) {
+; CHECK-LABEL: @permute2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @permute1_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @permute1_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> poison, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @permute2_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @permute2_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @permute1_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @permute1_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> poison, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31, i32 36, i32 37, i32 38, i32 39, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 47, i32 44, i32 45, i32 46, i32 47, i32 52, i32 53, i32 54, i32 55, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @permute2_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @permute2_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <64 x i8> %1
+}
+
+; Test that instcombine correctly folds a pshufb with values that
+; are not -128 and that are not encoded in four bits.
+
+define <16 x i8> @identity_test2_2(<16 x i8> %InVec) {
+; CHECK-LABEL: @identity_test2_2(
+; CHECK-NEXT:    ret <16 x i8> [[INVEC:%.*]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @identity_test_avx2_2(<32 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx2_2(
+; CHECK-NEXT:    ret <32 x i8> [[INVEC:%.*]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 16, i8 33, i8 66, i8 19, i8 36, i8 69, i8 22, i8 39, i8 72, i8 25, i8 42, i8 75, i8 28, i8 45, i8 78, i8 31, i8 48, i8 81, i8 34, i8 51, i8 84, i8 37, i8 54, i8 87, i8 40, i8 57, i8 90, i8 43, i8 60, i8 93, i8 46, i8 63>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @identity_test_avx512_2(<64 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx512_2(
+; CHECK-NEXT:    ret <64 x i8> [[INVEC:%.*]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 16, i8 33, i8 66, i8 19, i8 36, i8 69, i8 22, i8 39, i8 72, i8 25, i8 42, i8 75, i8 28, i8 45, i8 78, i8 31, i8 48, i8 81, i8 34, i8 51, i8 84, i8 37, i8 54, i8 87, i8 40, i8 57, i8 90, i8 43, i8 60, i8 93, i8 46, i8 63, i8 96, i8 49, i8 66, i8 99, i8 52, i8 69, i8 102, i8 55, i8 72, i8 105, i8 58, i8 75, i8 108, i8 61, i8 78, i8 111, i8 64, i8 81, i8 114, i8 67, i8 84, i8 117, i8 70, i8 87, i8 120, i8 73, i8 90, i8 123, i8 76, i8 93, i8 126, i8 79>)
+  ret <64 x i8> %1
+}
+
+define <16 x i8> @fold_to_zero_vector_2(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_2(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -125, i8 -1, i8 -53, i8 -32, i8 -4, i8 -7, i8 -33, i8 -66, i8 -99, i8 -120, i8 -100, i8 -22, i8 -17, i8 -1, i8 -11, i8 -15>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_to_zero_vector_avx2_2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx2_2(
+; CHECK-NEXT:    ret <32 x i8> zeroinitializer
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -127, i8 -1, i8 -53, i8 -32, i8 -4, i8 -7, i8 -33, i8 -66, i8 -99, i8 -120, i8 -100, i8 -22, i8 -17, i8 -1, i8 -11, i8 -15, i8 -126, i8 -2, i8 -52, i8 -31, i8 -5, i8 -8, i8 -34, i8 -67, i8 -100, i8 -119, i8 -101, i8 -23, i8 -16, i8 -2, i8 -12, i8 -16>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @fold_to_zero_vector_avx512_2(<64 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx512_2(
+; CHECK-NEXT:    ret <64 x i8> zeroinitializer
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -127, i8 -1, i8 -53, i8 -32, i8 -4, i8 -7, i8 -33, i8 -66, i8 -99, i8 -120, i8 -100, i8 -22, i8 -17, i8 -1, i8 -11, i8 -15, i8 -126, i8 -2, i8 -52, i8 -31, i8 -5, i8 -8, i8 -34, i8 -67, i8 -100, i8 -119, i8 -101, i8 -23, i8 -16, i8 -2, i8 -12, i8 -16, i8 -125, i8 -3, i8 -51, i8 -30, i8 -6, i8 -9, i8 -35, i8 -68, i8 -101, i8 -118, i8 -102, i8 -24, i8 -15, i8 -3, i8 -13, i8 -17, i8 -124, i8 -4, i8 -56, i8 -29, i8 -7, i8 -10, i8 -36, i8 -69, i8 -102, i8 -117, i8 -103, i8 -25, i8 -14, i8 -4, i8 -14, i8 -18>)
+  ret <64 x i8> %1
+}
+
+define <16 x i8> @permute3(<16 x i8> %InVec) {
+; CHECK-LABEL: @permute3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 48, i8 17, i8 34, i8 51, i8 20, i8 37, i8 54, i8 23, i8 16, i8 49, i8 66, i8 19, i8 52, i8 69, i8 22, i8 55>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @permute3_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @permute3_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> poison, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 52, i8 21, i8 38, i8 55, i8 20, i8 37, i8 54, i8 23, i8 28, i8 61, i8 78, i8 31, i8 60, i8 29, i8 30, i8 79, i8 52, i8 21, i8 38, i8 55, i8 20, i8 53, i8 102, i8 23, i8 92, i8 93, i8 94, i8 95, i8 108, i8 109, i8 110, i8 111>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @permute3_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @permute3_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> poison, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31, i32 36, i32 37, i32 38, i32 39, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 47, i32 44, i32 45, i32 46, i32 47, i32 52, i32 53, i32 54, i32 55, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 52, i8 21, i8 38, i8 55, i8 20, i8 37, i8 54, i8 23, i8 28, i8 61, i8 78, i8 31, i8 60, i8 29, i8 30, i8 79, i8 52, i8 21, i8 38, i8 55, i8 20, i8 53, i8 102, i8 23, i8 92, i8 93, i8 94, i8 95, i8 108, i8 109, i8 110, i8 111, i8 52, i8 21, i8 38, i8 55, i8 20, i8 37, i8 54, i8 23, i8 28, i8 61, i8 78, i8 31, i8 60, i8 29, i8 30, i8 79, i8 52, i8 21, i8 38, i8 55, i8 20, i8 53, i8 102, i8 23, i8 108, i8 109, i8 110, i8 111, i8 124, i8 125, i8 126, i8 127>)
+  ret <64 x i8> %1
+}
+
+; FIXME: Verify that instcombine is able to fold constant byte shuffles with undef mask elements.
+
+define <16 x i8> @fold_with_undef_elts(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_undef_elts(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 0, i32 16, i32 undef, i32 16, i32 1, i32 16, i32 undef, i32 16, i32 2, i32 16, i32 undef, i32 16, i32 3, i32 16, i32 undef, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_with_undef_elts_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_undef_elts_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[INVEC:%.*]], <32 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 0, i32 32, i32 undef, i32 32, i32 1, i32 32, i32 undef, i32 32, i32 2, i32 32, i32 undef, i32 32, i32 3, i32 32, i32 undef, i32 32, i32 16, i32 48, i32 undef, i32 48, i32 17, i32 48, i32 undef, i32 48, i32 18, i32 48, i32 undef, i32 48, i32 19, i32 48, i32 undef, i32 48>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128, i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @fold_with_undef_elts_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_undef_elts_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[INVEC:%.*]], <64 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 64, i32 undef, i32 64, i32 1, i32 64, i32 undef, i32 64, i32 2, i32 64, i32 undef, i32 64, i32 3, i32 64, i32 undef, i32 64, i32 16, i32 80, i32 undef, i32 80, i32 17, i32 80, i32 undef, i32 80, i32 18, i32 80, i32 undef, i32 80, i32 19, i32 80, i32 undef, i32 80, i32 32, i32 96, i32 undef, i32 96, i32 33, i32 96, i32 undef, i32 96, i32 34, i32 96, i32 undef, i32 96, i32 35, i32 96, i32 undef, i32 96, i32 48, i32 112, i32 undef, i32 112, i32 49, i32 112, i32 undef, i32 112, i32 50, i32 112, i32 undef, i32 112, i32 51, i32 112, i32 undef, i32 112>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128, i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128, i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128, i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128>)
+  ret <64 x i8> %1
+}
+
+define <16 x i8> @fold_with_allundef_elts(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_allundef_elts(
+; CHECK-NEXT:    ret <16 x i8> undef
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> undef)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_with_allundef_elts_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_allundef_elts_avx2(
+; CHECK-NEXT:    ret <32 x i8> undef
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> undef)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @fold_with_allundef_elts_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_allundef_elts_avx512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> undef)
+  ret <64 x i8> %1
+}
+
+; Demanded elts tests.
+
+define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask, i8 %M0, i8 %M15) {
+; CHECK-LABEL: @demanded_elts_insertion(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> [[INVEC:%.*]], <16 x i8> [[BASEMASK:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>
+; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+;
+  %1 = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0
+  %2 = insertelement <16 x i8> %1, i8 %M15, i32 15
+  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> %2)
+  %4 = shufflevector <16 x i8> %3, <16 x i8> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>
+  ret <16 x i8> %4
+}
+
+define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %BaseMask, i8 %M0, i8 %M22) {
+; CHECK-LABEL: @demanded_elts_insertion_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[INVEC:%.*]], <32 x i8> [[BASEMASK:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <32 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 undef, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
+;
+  %1 = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
+  %2 = insertelement <32 x i8> %1, i8 %M22, i32 22
+  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> %2)
+  %4 = shufflevector <32 x i8> %3, <32 x i8> poison, <32 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 undef, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %4
+}
+
+define <64 x i8> @demanded_elts_insertion_avx512(<64 x i8> %InVec, <64 x i8> %BaseMask, i8 %M0, i8 %M30) {
+; CHECK-LABEL: @demanded_elts_insertion_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <64 x i8> poison, i8 [[M0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[INVEC:%.*]], <64 x i8> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
+;
+  %1 = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0
+  %2 = insertelement <64 x i8> %1, i8 %M30, i32 30
+  %3 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> %2)
+  %4 = shufflevector <64 x i8> %3, <64 x i8> poison, <64 x i32> zeroinitializer
+  ret <64 x i8> %4
+}
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
+declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)

diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse4a-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse4a-inseltpoison.ll
new file mode 100644
index 000000000000..3270ca134db8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse4a-inseltpoison.ll
@@ -0,0 +1,420 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+
+;
+; EXTRQ
+;
+
+define <2 x i64> @test_extrq_call(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_call(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> [[X:%.*]], <16 x i8> [[Y:%.*]]) [[ATTR1:#.*]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrq_zero_arg0(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_zero_arg0(
+; CHECK-NEXT:    ret <2 x i64> <i64 0, i64 undef>
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> zeroinitializer, <16 x i8> %y) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrq_zero_arg1(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_zero_arg1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[X:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> zeroinitializer) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrq_to_extqi(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_to_extqi(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> [[X:%.*]], i8 8, i8 15)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> <i8 8, i8 15, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrq_constant(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_constant(
+; CHECK-NEXT:    ret <2 x i64> <i64 255, i64 undef>
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> <i64 -1, i64 55>, <16 x i8> <i8 8, i8 15, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrq_constant_undef(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_constant_undef(
+; CHECK-NEXT:    ret <2 x i64> <i64 65535, i64 undef>
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> <i64 -1, i64 undef>, <16 x i8> <i8 16, i8 15, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrq_call_constexpr(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrq_call_constexpr(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[X:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %1 = call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> bitcast (<2 x i64> <i64 0, i64 undef> to <16 x i8>))
+  ret <2 x i64> %1
+}
+
+;
+; EXTRQI
+;
+
+define <2 x i64> @test_extrqi_call(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_call(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> [[X:%.*]], i8 8, i8 23)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 23)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrqi_shuffle_1zuu(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_shuffle_1zuu(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[X:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 32, i8 32)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrqi_shuffle_2zzzzzzzuuuuuuuu(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_shuffle_2zzzzzzzuuuuuuuu(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[X:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> <i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 2, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 8, i8 16)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrqi_undef(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_undef(
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> zeroinitializer, i8 32, i8 33)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrqi_zero(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_zero(
+; CHECK-NEXT:    ret <2 x i64> <i64 0, i64 undef>
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> zeroinitializer, i8 3, i8 18)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrqi_constant(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_constant(
+; CHECK-NEXT:    ret <2 x i64> <i64 7, i64 undef>
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> <i64 -1, i64 55>, i8 3, i8 18)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrqi_constant_undef(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_constant_undef(
+; CHECK-NEXT:    ret <2 x i64> <i64 15, i64 undef>
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> <i64 -1, i64 undef>, i8 4, i8 18)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_extrqi_call_constexpr() {
+; CHECK-LABEL: @test_extrqi_call_constexpr(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> bitcast (<16 x i8> trunc (<16 x i16> bitcast (<4 x i64> <i64 0, i64 undef, i64 2, i64 undef> to <16 x i16>) to <16 x i8>) to <2 x i64>), i8 8, i8 16)
+  ret <2 x i64> %1
+}
+
+;
+; INSERTQ
+;
+
+define <2 x i64> @test_insertq_call(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_call(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) [[ATTR1]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_insertq_to_insertqi(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_to_insertqi(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> [[X:%.*]], <2 x i64> <i64 8, i64 poison>, i8 18, i8 2)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> <i64 8, i64 658>) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_insertq_constant(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_constant(
+; CHECK-NEXT:    ret <2 x i64> <i64 32, i64 undef>
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> <i64 0, i64 0>, <2 x i64> <i64 8, i64 658>) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_insertq_constant_undef(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_constant_undef(
+; CHECK-NEXT:    ret <2 x i64> <i64 33, i64 undef>
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> <i64 1, i64 undef>, <2 x i64> <i64 8, i64 658>) nounwind
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_insertq_call_constexpr(<2 x i64> %x) {
+; CHECK-LABEL: @test_insertq_call_constexpr(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> [[X:%.*]], <2 x i64> <i64 0, i64 poison>, i8 2, i8 0)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> bitcast (<16 x i8> trunc (<16 x i16> bitcast (<4 x i64> <i64 0, i64 undef, i64 2, i64 undef> to <16 x i16>) to <16 x i8>) to <2 x i64>))
+  ret <2 x i64> %1
+}
+
+;
+; INSERTQI
+;
+
+define <16 x i8> @test_insertqi_shuffle_04uu(<16 x i8> %v, <16 x i8> %i) {
+; CHECK-LABEL: @test_insertqi_shuffle_04uu(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V:%.*]], <16 x i8> [[I:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = bitcast <16 x i8> %v to <2 x i64>
+  %2 = bitcast <16 x i8> %i to <2 x i64>
+  %3 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %2, i8 32, i8 32)
+  %4 = bitcast <2 x i64> %3 to <16 x i8>
+  ret <16 x i8> %4
+}
+
+define <16 x i8> @test_insertqi_shuffle_8123uuuu(<16 x i8> %v, <16 x i8> %i) {
+; CHECK-LABEL: @test_insertqi_shuffle_8123uuuu(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[I:%.*]], <16 x i8> [[V:%.*]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = bitcast <16 x i8> %v to <2 x i64>
+  %2 = bitcast <16 x i8> %i to <2 x i64>
+  %3 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %2, i8 16, i8 0)
+  %4 = bitcast <2 x i64> %3 to <16 x i8>
+  ret <16 x i8> %4
+}
+
+define <2 x i64> @test_insertqi_constant(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @test_insertqi_constant(
+; CHECK-NEXT:    ret <2 x i64> <i64 -131055, i64 undef>
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 8, i64 0>, i8 16, i8 1)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_insertqi_call_constexpr(<2 x i64> %x) {
+; CHECK-LABEL: @test_insertqi_call_constexpr(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> [[X:%.*]], <2 x i64> <i64 0, i64 poison>, i8 48, i8 3)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> bitcast (<16 x i8> trunc (<16 x i16> bitcast (<4 x i64> <i64 0, i64 undef, i64 2, i64 undef> to <16 x i16>) to <16 x i8>) to <2 x i64>), i8 48, i8 3)
+  ret <2 x i64> %1
+}
+
+; The result of this insert is the second arg, since the top 64 bits of
+; the result are undefined, and we copy the bottom 64 bits from the
+; second arg
+define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @testInsert64Bits(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[I:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @testZeroLength(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[I:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @testUndefinedInsertq_1(
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @testUndefinedInsertq_2(
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
+; CHECK-LABEL: @testUndefinedInsertq_3(
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
+  ret <2 x i64> %1
+}
+
+;
+; Vector Demanded Bits
+;
+
+define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_arg0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> [[X:%.*]], <16 x i8> [[Y:%.*]]) [[ATTR1]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = shufflevector <2 x i64> %x, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %1, <16 x i8> %y) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_arg1(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> [[X:%.*]], <16 x i8> [[Y:%.*]]) [[ATTR1]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = shufflevector <16 x i8> %y, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %1) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_args01(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> [[X:%.*]], <16 x i8> [[Y:%.*]]) [[ATTR1]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = shufflevector <2 x i64> %x, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %2 = shufflevector <16 x i8> %y, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %3 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %1, <16 x i8> %2) nounwind
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) {
+; CHECK-LABEL: @test_extrq_ret(
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_arg0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> [[X:%.*]], i8 3, i8 2)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = shufflevector <2 x i64> %x, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %1, i8 3, i8 2)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_extrqi_ret(<2 x i64> %x) {
+; CHECK-LABEL: @test_extrqi_ret(
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_arg0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) [[ATTR1]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = shufflevector <2 x i64> %x, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %1, <2 x i64> %y) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertq_ret(
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertqi_arg0(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], i8 3, i8 2) [[ATTR1]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = shufflevector <2 x i64> %x, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %y, i8 3, i8 2) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertqi_arg1(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertqi_arg1(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], i8 3, i8 2) [[ATTR1]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = shufflevector <2 x i64> %y, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %1, i8 3, i8 2) nounwind
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertqi_args01(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], i8 3, i8 2) [[ATTR1]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = shufflevector <2 x i64> %x, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %2 = shufflevector <2 x i64> %y, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %3 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %2, i8 3, i8 2) nounwind
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @test_insertqi_ret(
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) nounwind
+  %2 = shufflevector <2 x i64> %1, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %2
+}
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.extrq
+declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.extrqi
+declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertq
+declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind

diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts-inseltpoison.ll
index e23e222748c2..38a3a6f5c70b 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts-inseltpoison.ll
@@ -2923,12 +2923,12 @@ define <4 x i32> @avx2_psrav_d_128_masked(<4 x i32> %v, <4 x i32> %a) {
 define <4 x i32> @avx2_psrav_d_128_masked_shuffle(<4 x i32> %v, <4 x i32> %a) {
 ; CHECK-LABEL: @avx2_psrav_d_128_masked_shuffle(
 ; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], <i32 poison, i32 poison, i32 15, i32 31>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[V:%.*]], [[TMP2]]
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %1 = and <4 x i32> %a, <i32 undef, i32 undef, i32 15, i32 31>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
   %3 = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> %2)
   ret <4 x i32> %3
 }
@@ -3030,7 +3030,7 @@ define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> [[V:%.*]], <8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1)
   ret <8 x i16> %2
 }
@@ -3041,7 +3041,7 @@ define <8 x i16> @sse2_psra_w_var_bc(<8 x i16> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> [[V:%.*]], <8 x i16> [[TMP1]])
 ; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = bitcast <2 x i64> %1 to <8 x i16>
   %3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %2)
   ret <8 x i16> %3
@@ -3052,7 +3052,7 @@ define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> [[V:%.*]], <4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1)
   ret <4 x i32> %2
 }
@@ -3063,7 +3063,7 @@ define <4 x i32> @sse2_psra_d_var_bc(<4 x i32> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> [[V:%.*]], <4 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = bitcast <8 x i16> %1 to <4 x i32>
   %3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %2)
   ret <4 x i32> %3
@@ -3074,7 +3074,7 @@ define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> [[V:%.*]], <8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1)
   ret <16 x i16> %2
 }
@@ -3084,7 +3084,7 @@ define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> [[V:%.*]], <4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1)
   ret <8 x i32> %2
 }
@@ -3094,7 +3094,7 @@ define <2 x i64> @avx512_psra_q_128_var(<2 x i64> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[V:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> %1)
   ret <2 x i64> %2
 }
@@ -3104,7 +3104,7 @@ define <4 x i64> @avx512_psra_q_256_var(<4 x i64> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[V:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> %1)
   ret <4 x i64> %2
 }
@@ -3114,7 +3114,7 @@ define <32 x i16> @avx512_psra_w_512_var(<32 x i16> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> [[V:%.*]], <8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> %1)
   ret <32 x i16> %2
 }
@@ -3124,7 +3124,7 @@ define <16 x i32> @avx512_psra_d_512_var(<16 x i32> %v, <4 x i32> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[V:%.*]], <4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %2 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> %1)
   ret <16 x i32> %2
 }
@@ -3134,7 +3134,7 @@ define <8 x i64> @avx512_psra_q_512_var(<8 x i64> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[V:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> %1)
   ret <8 x i64> %2
 }
@@ -3144,7 +3144,7 @@ define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> [[V:%.*]], <8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1)
   ret <8 x i16> %2
 }
@@ -3154,7 +3154,7 @@ define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> [[V:%.*]], <4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1)
   ret <4 x i32> %2
 }
@@ -3164,7 +3164,7 @@ define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> [[V:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1)
   ret <2 x i64> %2
 }
@@ -3174,7 +3174,7 @@ define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> [[V:%.*]], <8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1)
   ret <16 x i16> %2
 }
@@ -3185,7 +3185,7 @@ define <16 x i16> @avx2_psrl_w_var_bc(<16 x i16> %v, <16 x i8> %a) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> [[V:%.*]], <8 x i16> [[TMP1]])
 ; CHECK-NEXT:    ret <16 x i16> [[TMP2]]
 ;
-  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %1 = shufflevector <16 x i8> %a, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = bitcast <16 x i8> %1 to <8 x i16>
   %3 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %2)
   ret <16 x i16> %3
@@ -3196,7 +3196,7 @@ define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> [[V:%.*]], <4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1)
   ret <8 x i32> %2
 }
@@ -3207,7 +3207,7 @@ define <8 x i32> @avx2_psrl_d_var_bc(<8 x i32> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> [[V:%.*]], <4 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = bitcast <2 x i64> %1 to <4 x i32>
   %3 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %2)
   ret <8 x i32> %3
@@ -3218,7 +3218,7 @@ define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> [[V:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1)
   ret <4 x i64> %2
 }
@@ -3228,7 +3228,7 @@ define <32 x i16> @avx512_psrl_w_512_var(<32 x i16> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> [[V:%.*]], <8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %1)
   ret <32 x i16> %2
 }
@@ -3239,7 +3239,7 @@ define <32 x i16> @avx512_psrl_w_512_var_bc(<32 x i16> %v, <16 x i8> %a) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> [[V:%.*]], <8 x i16> [[TMP1]])
 ; CHECK-NEXT:    ret <32 x i16> [[TMP2]]
 ;
-  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %1 = shufflevector <16 x i8> %a, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = bitcast <16 x i8> %1 to <8 x i16>
   %3 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %2)
   ret <32 x i16> %3
@@ -3250,7 +3250,7 @@ define <16 x i32> @avx512_psrl_d_512_var(<16 x i32> %v, <4 x i32> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[V:%.*]], <4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %2 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> %1)
   ret <16 x i32> %2
 }
@@ -3261,7 +3261,7 @@ define <16 x i32> @avx512_psrl_d_512_var_bc(<16 x i32> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[V:%.*]], <4 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret <16 x i32> [[TMP2]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = bitcast <2 x i64> %1 to <4 x i32>
   %3 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> %2)
   ret <16 x i32> %3
@@ -3272,7 +3272,7 @@ define <8 x i64> @avx512_psrl_q_512_var(<8 x i64> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[V:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> %1)
   ret <8 x i64> %2
 }
@@ -3282,7 +3282,7 @@ define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> [[V:%.*]], <8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1)
   ret <8 x i16> %2
 }
@@ -3292,7 +3292,7 @@ define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> [[V:%.*]], <4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1)
   ret <4 x i32> %2
 }
@@ -3302,7 +3302,7 @@ define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> [[V:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1)
   ret <2 x i64> %2
 }
@@ -3312,7 +3312,7 @@ define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> [[V:%.*]], <8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1)
   ret <16 x i16> %2
 }
@@ -3322,7 +3322,7 @@ define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> [[V:%.*]], <4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1)
   ret <8 x i32> %2
 }
@@ -3332,7 +3332,7 @@ define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> [[V:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1)
   ret <4 x i64> %2
 }
@@ -3342,7 +3342,7 @@ define <32 x i16> @avx512_psll_w_512_var(<32 x i16> %v, <8 x i16> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> [[V:%.*]], <8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
 ;
-  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %2 = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> %1)
   ret <32 x i16> %2
 }
@@ -3352,7 +3352,7 @@ define <16 x i32> @avx512_psll_d_512_var(<16 x i32> %v, <4 x i32> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[V:%.*]], <4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %2 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> %1)
   ret <16 x i32> %2
 }
@@ -3362,7 +3362,7 @@ define <8 x i64> @avx512_psll_q_512_var(<8 x i64> %v, <2 x i64> %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[V:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
 ;
-  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
   %2 = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> %1)
   ret <8 x i64> %2
 }

diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vpermil-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-vpermil-inseltpoison.ll
new file mode 100644
index 000000000000..f633a3d43569
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vpermil-inseltpoison.ll
@@ -0,0 +1,301 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Verify that instcombine is able to fold identity shuffles.
+
+define <4 x float> @identity_test_vpermilvar_ps(<4 x float> %v) {
+; CHECK-LABEL: @identity_test_vpermilvar_ps(
+; CHECK-NEXT:    ret <4 x float> [[V:%.*]]
+;
+  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
+  ret <4 x float> %a
+}
+
+define <8 x float> @identity_test_vpermilvar_ps_256(<8 x float> %v) {
+; CHECK-LABEL: @identity_test_vpermilvar_ps_256(
+; CHECK-NEXT:    ret <8 x float> [[V:%.*]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+  ret <8 x float> %a
+}
+
+define <16 x float> @identity_test_vpermilvar_ps_512(<16 x float> %v) {
+; CHECK-LABEL: @identity_test_vpermilvar_ps_512(
+; CHECK-NEXT:    ret <16 x float> [[V:%.*]]
+;
+  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>)
+  ret <16 x float> %a
+}
+
+define <2 x double> @identity_test_vpermilvar_pd(<2 x double> %v) {
+; CHECK-LABEL: @identity_test_vpermilvar_pd(
+; CHECK-NEXT:    ret <2 x double> [[V:%.*]]
+;
+  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 0, i64 2>)
+  ret <2 x double> %a
+}
+
+define <4 x double> @identity_test_vpermilvar_pd_256(<4 x double> %v) {
+; CHECK-LABEL: @identity_test_vpermilvar_pd_256(
+; CHECK-NEXT:    ret <4 x double> [[V:%.*]]
+;
+  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 0, i64 2, i64 0, i64 2>)
+  ret <4 x double> %a
+}
+
+define <8 x double> @identity_test_vpermilvar_pd_512(<8 x double> %v) {
+; CHECK-LABEL: @identity_test_vpermilvar_pd_512(
+; CHECK-NEXT:    ret <8 x double> [[V:%.*]]
+;
+  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 0, i64 2, i64 0, i64 2, i64 0, i64 2, i64 0, i64 2>)
+  ret <8 x double> %a
+}
+
+; Instcombine should be able to fold the following byte shuffle to a builtin shufflevector
+; with a shuffle mask of all zeroes.
+
+define <4 x float> @zero_test_vpermilvar_ps_zero(<4 x float> %v) {
+; CHECK-LABEL: @zero_test_vpermilvar_ps_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
+  ret <4 x float> %a
+}
+
+define <8 x float> @zero_test_vpermilvar_ps_256_zero(<8 x float> %v) {
+; CHECK-LABEL: @zero_test_vpermilvar_ps_256_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[V:%.*]], <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
+  ret <8 x float> %a
+}
+
+define <16 x float> @zero_test_vpermilvar_ps_512_zero(<16 x float> %v) {
+; CHECK-LABEL: @zero_test_vpermilvar_ps_512_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[V:%.*]], <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> zeroinitializer)
+  ret <16 x float> %a
+}
+
+define <2 x double> @zero_test_vpermilvar_pd_zero(<2 x double> %v) {
+; CHECK-LABEL: @zero_test_vpermilvar_pd_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> zeroinitializer)
+  ret <2 x double> %a
+}
+
+define <4 x double> @zero_test_vpermilvar_pd_256_zero(<4 x double> %v) {
+; CHECK-LABEL: @zero_test_vpermilvar_pd_256_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V:%.*]], <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
+  ret <4 x double> %a
+}
+
+define <8 x double> @zero_test_vpermilvar_pd_512_zero(<8 x double> %v) {
+; CHECK-LABEL: @zero_test_vpermilvar_pd_512_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[V:%.*]], <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> zeroinitializer)
+  ret <8 x double> %a
+}
+
+; Verify that instcombine is able to fold constant shuffles.
+
+define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+  ret <4 x float> %a
+}
+
+define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[V:%.*]], <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x float> %a
+}
+
+define <16 x float> @test_vpermilvar_ps_512(<16 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[V:%.*]], <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <16 x float> %a
+}
+
+define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 2, i64 0>)
+  ret <2 x double> %a
+}
+
+define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V:%.*]], <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 3, i64 1, i64 2, i64 0>)
+  ret <4 x double> %a
+}
+
+define <8 x double> @test_vpermilvar_pd_512(<8 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[V:%.*]], <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 3, i64 1, i64 2, i64 0, i64 7, i64 5, i64 6, i64 4>)
+  ret <8 x double> %a
+}
+
+; Verify that instcombine is able to fold constant shuffles with undef mask elements.
+
+define <4 x float> @undef_test_vpermilvar_ps(<4 x float> %v) {
+; CHECK-LABEL: @undef_test_vpermilvar_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>)
+  ret <4 x float> %a
+}
+
+define <8 x float> @undef_test_vpermilvar_ps_256(<8 x float> %v) {
+; CHECK-LABEL: @undef_test_vpermilvar_ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[V:%.*]], <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x float> %a
+}
+
+define <16 x float> @undef_test_vpermilvar_ps_512(<16 x float> %v) {
+; CHECK-LABEL: @undef_test_vpermilvar_ps_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[V:%.*]], <16 x float> undef, <16 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 7, i32 6, i32 5, i32 4, i32 undef, i32 10, i32 9, i32 undef, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0, i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0>)
+  ret <16 x float> %a
+}
+
+define <2 x double> @undef_test_vpermilvar_pd(<2 x double> %v) {
+; CHECK-LABEL: @undef_test_vpermilvar_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 undef, i64 0>)
+  ret <2 x double> %a
+}
+
+define <4 x double> @undef_test_vpermilvar_pd_256(<4 x double> %v) {
+; CHECK-LABEL: @undef_test_vpermilvar_pd_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V:%.*]], <4 x double> undef, <4 x i32> <i32 undef, i32 0, i32 3, i32 undef>
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 undef, i64 1, i64 2, i64 undef>)
+  ret <4 x double> %a
+}
+
+define <8 x double> @undef_test_vpermilvar_pd_512(<8 x double> %v) {
+; CHECK-LABEL: @undef_test_vpermilvar_pd_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[V:%.*]], <8 x double> undef, <8 x i32> <i32 undef, i32 0, i32 3, i32 undef, i32 undef, i32 4, i32 7, i32 undef>
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 undef, i64 1, i64 2, i64 undef, i64 undef, i64 1, i64 2, i64 undef>)
+  ret <8 x double> %a
+}
+
+; Simplify demanded elts
+
+define <4 x float> @elts_test_vpermilvar_ps(<4 x float> %a0, i32 %a1) {
+; CHECK-LABEL: @elts_test_vpermilvar_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %a1, i32 3
+  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %1)
+  %3 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  ret <4 x float> %3
+}
+
+define <8 x float> @elts_test_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @elts_test_vpermilvar_ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 6, i32 undef, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = shufflevector <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %1)
+  %3 = shufflevector <8 x float> %2, <8 x float> poison, <8 x i32> <i32 undef, i32 1, i32 undef, i32 3, i32 undef, i32 5, i32 undef, i32 7>
+  ret <8 x float> %3
+}
+
+define <16 x float> @elts_test_vpermilvar_ps_512(<16 x float> %a0, <16 x i32> %a1, i32 %a2) {
+; CHECK-LABEL: @elts_test_vpermilvar_ps_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[A0:%.*]], <16 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x float> [[TMP2]]
+;
+  %1 = insertelement <16 x i32> %a1, i32 %a2, i32 0
+  %2 = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %1)
+  %3 = shufflevector <16 x float> %2, <16 x float> poison, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %3
+}
+
+define <2 x double> @elts_test_vpermilvar_pd(<2 x double> %a0, i64 %a1) {
+; CHECK-LABEL: @elts_test_vpermilvar_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x i64> <i64 0, i64 2>, i64 %a1, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %1)
+  %3 = shufflevector <2 x double> %2, <2 x double> poison, <2 x i32> <i32 0, i32 undef>
+  ret <2 x double> %3
+}
+
+define <4 x double> @elts_test_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: @elts_test_vpermilvar_pd_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 undef>
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = shufflevector <4 x i64> <i64 0, i64 2, i64 0, i64 2>, <4 x i64> %a1, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %1)
+  %3 = shufflevector <4 x double> %2, <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  ret <4 x double> %3
+}
+
+define <8 x double> @elts_test_vpermilvar_pd_512(<8 x double> %a0, <8 x i64> %a1, i64 %a2) {
+; CHECK-LABEL: @elts_test_vpermilvar_pd_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i64> poison, i64 [[A2:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[A0:%.*]], <8 x i64> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = insertelement <8 x i64> %a1, i64 %a2, i32 0
+  %2 = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %1)
+  %3 = shufflevector <8 x double> %2, <8 x double> poison, <8 x i32> zeroinitializer
+  ret <8 x double> %3
+}
+
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
+declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
+
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
+declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)

diff --git a/llvm/test/Transforms/InstCombine/assume-inseltpoison.ll b/llvm/test/Transforms/InstCombine/assume-inseltpoison.ll
new file mode 100644
index 000000000000..8c04c4af28ce
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/assume-inseltpoison.ll
@@ -0,0 +1,656 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S -instcombine-infinite-loop-threshold=2 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.assume(i1) #1
+
+; Check that the alignment has been upgraded and that the assume has not
+; been removed:
+
+define i32 @foo1(i32* %a) #0 {
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT:    [[T0:%.*]] = load i32, i32* [[A:%.*]], align 32
+; CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64
+; CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
+; CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT:    ret i32 [[T0]]
+;
+  %t0 = load i32, i32* %a, align 4
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  ret i32 %t0
+}
+
+; Same check as in @foo1, but make sure it works if the assume is first too.
+
+define i32 @foo2(i32* %a) #0 {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i32* [[A:%.*]] to i64
+; CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
+; CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT:    [[T0:%.*]] = load i32, i32* [[A]], align 32
+; CHECK-NEXT:    ret i32 [[T0]]
+;
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %t0 = load i32, i32* %a, align 4
+  ret i32 %t0
+}
+
+define i32 @simple(i32 %a) #1 {
+; CHECK-LABEL: @simple(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i32 4
+;
+  %cmp = icmp eq i32 %a, 4
+  tail call void @llvm.assume(i1 %cmp)
+  ret i32 %a
+}
+
+define i32 @can1(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: @can1(
+; CHECK-NEXT:    call void @llvm.assume(i1 [[A:%.*]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[B:%.*]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C:%.*]])
+; CHECK-NEXT:    ret i32 5
+;
+  %and1 = and i1 %a, %b
+  %and  = and i1 %and1, %c
+  tail call void @llvm.assume(i1 %and)
+  ret i32 5
+}
+
+define i32 @can2(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: @can2(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[A:%.*]], true
+; CHECK-NEXT:    call void @llvm.assume(i1 [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i1 [[B:%.*]], true
+; CHECK-NEXT:    call void @llvm.assume(i1 [[TMP2]])
+; CHECK-NEXT:    ret i32 5
+;
+  %v = or i1 %a, %b
+  %w = xor i1 %v, 1
+  tail call void @llvm.assume(i1 %w)
+  ret i32 5
+}
+
+define i32 @bar1(i32 %a) #0 {
+; CHECK-LABEL: @bar1(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i32 1
+;
+  %and1 = and i32 %a, 3
+  %and = and i32 %a, 7
+  %cmp = icmp eq i32 %and, 1
+  tail call void @llvm.assume(i1 %cmp)
+  ret i32 %and1
+}
+
+define i32 @bar2(i32 %a) #0 {
+; CHECK-LABEL: @bar2(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i32 1
+;
+  %and = and i32 %a, 7
+  %cmp = icmp eq i32 %and, 1
+  tail call void @llvm.assume(i1 %cmp)
+  %and1 = and i32 %a, 3
+  ret i32 %and1
+}
+
+define i32 @bar3(i32 %a, i1 %x, i1 %y) #0 {
+; CHECK-LABEL: @bar3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[X:%.*]])
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[Y:%.*]])
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  %and1 = and i32 %a, 3
+
+; Don't be fooled by other assumes around.
+
+  tail call void @llvm.assume(i1 %x)
+
+  %and = and i32 %a, 7
+  %cmp = icmp eq i32 %and, 1
+  tail call void @llvm.assume(i1 %cmp)
+
+  tail call void @llvm.assume(i1 %y)
+
+  ret i32 %and1
+}
+
+define i32 @bar4(i32 %a, i32 %b) {
+; CHECK-LABEL: @bar4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[A]], [[B:%.*]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  %and1 = and i32 %b, 3
+  %and = and i32 %a, 7
+  %cmp = icmp eq i32 %and, 1
+  tail call void @llvm.assume(i1 %cmp)
+  %cmp2 = icmp eq i32 %a, %b
+  tail call void @llvm.assume(i1 %cmp2)
+  ret i32 %and1
+}
+
+define i32 @icmp1(i32 %a) #0 {
+; CHECK-LABEL: @icmp1(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i32 1
+;
+  %cmp = icmp sgt i32 %a, 5
+  tail call void @llvm.assume(i1 %cmp)
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp2(i32 %a) #0 {
+; CHECK-LABEL: @icmp2(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i32 0
+;
+  %cmp = icmp sgt i32 %a, 5
+  tail call void @llvm.assume(i1 %cmp)
+  %t0 = zext i1 %cmp to i32
+  %lnot.ext = xor i32 %t0, 1
+  ret i32 %lnot.ext
+}
+
+; If the 'not' of a condition is known true, then the condition must be false.
+
+define i1 @assume_not(i1 %cond) {
+; CHECK-LABEL: @assume_not(
+; CHECK-NEXT:    [[NOTCOND:%.*]] = xor i1 [[COND:%.*]], true
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTCOND]])
+; CHECK-NEXT:    ret i1 false
+;
+  %notcond = xor i1 %cond, true
+  call void @llvm.assume(i1 %notcond)
+  ret i1 %cond
+}
+
+declare void @escape(i32* %a)
+
+; Canonicalize a nonnull assumption on a load into metadata form.
+
+define i32 @bundle1(i32* %P) {
+; CHECK-LABEL: @bundle1(
+; CHECK-NEXT:    tail call void @llvm.assume(i1 true) [ "nonnull"(i32* [[P:%.*]]) ]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+  tail call void @llvm.assume(i1 true) ["nonnull"(i32* %P)]
+  %load = load i32, i32* %P
+  ret i32 %load
+}
+
+define i32 @bundle2(i32* %P) {
+; CHECK-LABEL: @bundle2(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+  tail call void @llvm.assume(i1 true) ["ignore"(i32* undef)]
+  %load = load i32, i32* %P
+  ret i32 %load
+}
+
+define i1 @nonnull1(i32** %a) {
+; CHECK-LABEL: @nonnull1(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32*, i32** [[A:%.*]], align 8, !nonnull !6
+; CHECK-NEXT:    tail call void @escape(i32* nonnull [[LOAD]])
+; CHECK-NEXT:    ret i1 false
+;
+  %load = load i32*, i32** %a
+  %cmp = icmp ne i32* %load, null
+  tail call void @llvm.assume(i1 %cmp)
+  tail call void @escape(i32* %load)
+  %rval = icmp eq i32* %load, null
+  ret i1 %rval
+}
+
+; Make sure the above canonicalization applies only
+; to pointer types.  Doing otherwise would be illegal.
+
+define i1 @nonnull2(i32* %a) {
+; CHECK-LABEL: @nonnull2(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[LOAD]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i1 false
+;
+  %load = load i32, i32* %a
+  %cmp = icmp ne i32 %load, 0
+  tail call void @llvm.assume(i1 %cmp)
+  %rval = icmp eq i32 %load, 0
+  ret i1 %rval
+}
+
+; Make sure the above canonicalization does not trigger
+; if the assume is control dependent on something else
+
+define i1 @nonnull3(i32** %a, i1 %control) {
+; CHECK-LABEL: @nonnull3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32*, i32** [[A:%.*]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32* [[LOAD]], null
+; CHECK-NEXT:    br i1 [[CONTROL:%.*]], label [[TAKEN:%.*]], label [[NOT_TAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i1 false
+; CHECK:       not_taken:
+; CHECK-NEXT:    [[RVAL_2:%.*]] = icmp sgt i32* [[LOAD]], null
+; CHECK-NEXT:    ret i1 [[RVAL_2]]
+;
+entry:
+  %load = load i32*, i32** %a
+  %cmp = icmp ne i32* %load, null
+  br i1 %control, label %taken, label %not_taken
+taken:
+  tail call void @llvm.assume(i1 %cmp)
+  %rval = icmp eq i32* %load, null
+  ret i1 %rval
+not_taken:
+  %rval.2 = icmp sgt i32* %load, null
+  ret i1 %rval.2
+}
+
+; Make sure the above canonicalization does not trigger
+; if the path from the load to the assume is potentially
+; interrupted by an exception being thrown
+
+define i1 @nonnull4(i32** %a) {
+; CHECK-LABEL: @nonnull4(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32*, i32** [[A:%.*]], align 8
+; CHECK-NEXT:    tail call void @escape(i32* [[LOAD]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32* [[LOAD]], null
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i1 false
+;
+  %load = load i32*, i32** %a
+  ;; This call may throw!
+  tail call void @escape(i32* %load)
+  %cmp = icmp ne i32* %load, null
+  tail call void @llvm.assume(i1 %cmp)
+  %rval = icmp eq i32* %load, null
+  ret i1 %rval
+}
+define i1 @nonnull5(i32** %a) {
+; CHECK-LABEL: @nonnull5(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32*, i32** [[A:%.*]], align 8
+; CHECK-NEXT:    tail call void @escape(i32* [[LOAD]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32* [[LOAD]], null
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i1 false
+;
+  %load = load i32*, i32** %a
+  ;; This call may throw!
+  tail call void @escape(i32* %load)
+  %integral = ptrtoint i32* %load to i64
+  %cmp = icmp slt i64 %integral, 0
+  tail call void @llvm.assume(i1 %cmp) ; %load has at least highest bit set
+  %rval = icmp eq i32* %load, null
+  ret i1 %rval
+}
+
+; PR35846 - https://bugs.llvm.org/show_bug.cgi?id=35846
+
+define i32 @assumption_conflicts_with_known_bits(i32 %a, i32 %b) {
+; CHECK-LABEL: @assumption_conflicts_with_known_bits(
+; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[B:%.*]], 3
+; CHECK-NEXT:    tail call void @llvm.assume(i1 false)
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[AND1]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT:    ret i32 0
+;
+  %and1 = and i32 %b, 3
+  %B1 = lshr i32 %and1, %and1
+  %B3 = shl nuw nsw i32 %and1, %B1
+  %cmp = icmp eq i32 %B3, 1
+  tail call void @llvm.assume(i1 %cmp)
+  %cmp2 = icmp eq i32 %B1, %B3
+  tail call void @llvm.assume(i1 %cmp2)
+  ret i32 %and1
+}
+
+; PR37726 - https://bugs.llvm.org/show_bug.cgi?id=37726
+; There's a loophole in eliminating a redundant assumption when
+; we have conflicting assumptions. Verify that debuginfo doesn't
+; get in the way of the fold.
+
+define void @debug_interference(i8 %x) {
+; CHECK-LABEL: @debug_interference(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 [[X:%.*]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 false)
+; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i32 5, [[META7:metadata !.*]], metadata !DIExpression()), [[DBG9:!dbg !.*]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 false)
+; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i32 5, [[META7]], metadata !DIExpression()), [[DBG9]]
+; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i32 5, [[META7]], metadata !DIExpression()), [[DBG9]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp eq i8 %x, 0
+  %cmp2 = icmp ne i8 %x, 0
+  tail call void @llvm.assume(i1 %cmp1)
+  tail call void @llvm.dbg.value(metadata i32 5, metadata !1, metadata !DIExpression()), !dbg !9
+  tail call void @llvm.assume(i1 %cmp1)
+  tail call void @llvm.dbg.value(metadata i32 5, metadata !1, metadata !DIExpression()), !dbg !9
+  tail call void @llvm.assume(i1 %cmp2)
+  tail call void @llvm.dbg.value(metadata i32 5, metadata !1, metadata !DIExpression()), !dbg !9
+  tail call void @llvm.assume(i1 %cmp2)
+  ret void
+}
+
+; This would crash.
+; Does it ever make sense to peek through a bitcast of the icmp operand?
+
+define i32 @PR40940(<4 x i8> %x) {
+; CHECK-LABEL: @PR40940(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[T2:%.*]] = bitcast <4 x i8> [[SHUF]] to i32
+; CHECK-NEXT:    [[T3:%.*]] = icmp ult i32 [[T2]], 65536
+; CHECK-NEXT:    call void @llvm.assume(i1 [[T3]])
+; CHECK-NEXT:    ret i32 [[T2]]
+;
+  %shuf = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+  %t2 = bitcast <4 x i8> %shuf to i32
+  %t3 = icmp ult i32 %t2, 65536
+  call void @llvm.assume(i1 %t3)
+  ret i32 %t2
+}
+
+define i1 @nonnull3A(i32** %a, i1 %control) {
+; CHECK-LABEL: @nonnull3A(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32*, i32** [[A:%.*]], align 8
+; CHECK-NEXT:    br i1 [[CONTROL:%.*]], label [[TAKEN:%.*]], label [[NOT_TAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32* [[LOAD]], null
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    ret i1 true
+; CHECK:       not_taken:
+; CHECK-NEXT:    [[RVAL_2:%.*]] = icmp sgt i32* [[LOAD]], null
+; CHECK-NEXT:    ret i1 [[RVAL_2]]
+;
+entry:
+  %load = load i32*, i32** %a
+  %cmp = icmp ne i32* %load, null
+  br i1 %control, label %taken, label %not_taken
+taken:
+  call void @llvm.assume(i1 %cmp)
+  ret i1 %cmp
+not_taken:
+  call void @llvm.assume(i1 %cmp)
+  %rval.2 = icmp sgt i32* %load, null
+  ret i1 %rval.2
+}
+
+define i1 @nonnull3B(i32** %a, i1 %control) {
+; CHECK-LABEL: @nonnull3B(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[CONTROL:%.*]], label [[TAKEN:%.*]], label [[NOT_TAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32*, i32** [[A:%.*]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32* [[LOAD]], null
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]]) [ "nonnull"(i32* [[LOAD]]), "nonnull"(i1 [[CMP]]) ]
+; CHECK-NEXT:    ret i1 true
+; CHECK:       not_taken:
+; CHECK-NEXT:    ret i1 [[CONTROL]]
+;
+entry:
+  %load = load i32*, i32** %a
+  %cmp = icmp ne i32* %load, null
+  br i1 %control, label %taken, label %not_taken
+taken:
+  call void @llvm.assume(i1 %cmp) ["nonnull"(i32* %load), "nonnull"(i1 %cmp)]
+  ret i1 %cmp
+not_taken:
+  call void @llvm.assume(i1 %cmp) ["nonnull"(i32* %load), "nonnull"(i1 %cmp)]
+  ret i1 %control
+}
+
+declare i1 @tmp1(i1)
+
+define i1 @nonnull3C(i32** %a, i1 %control) {
+; CHECK-LABEL: @nonnull3C(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[CONTROL:%.*]], label [[TAKEN:%.*]], label [[NOT_TAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32*, i32** [[A:%.*]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32* [[LOAD]], null
+; CHECK-NEXT:    [[CMP2:%.*]] = call i1 @tmp1(i1 [[CMP]])
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       not_taken:
+; CHECK-NEXT:    ret i1 [[CONTROL]]
+;
+entry:
+  %load = load i32*, i32** %a
+  %cmp = icmp ne i32* %load, null
+  br i1 %control, label %taken, label %not_taken
+taken:
+  %cmp2 = call i1 @tmp1(i1 %cmp)
+  br label %exit
+exit:
+  ; FIXME: this shouldn't be dropped because it is still dominated by the new position of %load
+  call void @llvm.assume(i1 %cmp) ["nonnull"(i32* %load), "nonnull"(i1 %cmp)]
+  ret i1 %cmp2
+not_taken:
+  call void @llvm.assume(i1 %cmp)
+  ret i1 %control
+}
+
+define i1 @nonnull3D(i32** %a, i1 %control) {
+; CHECK-LABEL: @nonnull3D(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[CONTROL:%.*]], label [[TAKEN:%.*]], label [[NOT_TAKEN:%.*]]
+; CHECK:       taken:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32*, i32** [[A:%.*]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32* [[LOAD]], null
+; CHECK-NEXT:    [[CMP2:%.*]] = call i1 @tmp1(i1 [[CMP]])
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       not_taken:
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "ignore"(i32* undef), "ignore"(i1 undef), "nonnull"(i1 [[CONTROL]]) ]
+; CHECK-NEXT:    ret i1 [[CONTROL]]
+;
+entry:
+  %load = load i32*, i32** %a
+  %cmp = icmp ne i32* %load, null
+  br i1 %control, label %taken, label %not_taken
+taken:
+  %cmp2 = call i1 @tmp1(i1 %cmp)
+  br label %exit
+exit:
+  ret i1 %cmp2
+not_taken:
+  call void @llvm.assume(i1 %cmp) ["nonnull"(i32* %load), "nonnull"(i1 %cmp), "nonnull"(i1 %control)]
+  ret i1 %control
+}
+
+
+define void @always_true_assumption() {
+; CHECK-LABEL: @always_true_assumption(
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.assume(i1 true)
+  ret void
+}
+
+; The alloca guarantees that the low bits of %a are zero because of alignment.
+; The assume says the opposite. Make sure we don't crash.
+
+define i64 @PR31809() {
+; CHECK-LABEL: @PR31809(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[T1:%.*]] = ptrtoint i32* [[A]] to i64
+; CHECK-NEXT:    call void @llvm.assume(i1 false)
+; CHECK-NEXT:    ret i64 [[T1]]
+;
+  %a = alloca i32
+  %t1 = ptrtoint i32* %a to i64
+  %cond = icmp eq i64 %t1, 3
+  call void @llvm.assume(i1 %cond)
+  ret i64 %t1
+}
+
+; Similar to above: there's no way to know which assumption is truthful,
+; so just don't crash.
+
+define i8 @conflicting_assumptions(i8 %x){
+; CHECK-LABEL: @conflicting_assumptions(
+; CHECK-NEXT:    call void @llvm.assume(i1 false)
+; CHECK-NEXT:    [[COND2:%.*]] = icmp eq i8 [[X:%.*]], 4
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND2]])
+; CHECK-NEXT:    ret i8 5
+;
+  %add = add i8 %x, 1
+  %cond1 = icmp eq i8 %x, 3
+  call void @llvm.assume(i1 %cond1)
+  %cond2 = icmp eq i8 %x, 4
+  call void @llvm.assume(i1 %cond2)
+  ret i8 %add
+}
+
+; Another case of conflicting assumptions. This would crash because we'd
+; try to set more known bits than existed in the known bits struct.
+
+define void @PR36270(i32 %b) {
+; CHECK-LABEL: @PR36270(
+; CHECK-NEXT:    unreachable
+;
+  %B7 = xor i32 -1, 2147483647
+  %and1 = and i32 %b, 3
+  %B12 = lshr i32 %B7, %and1
+  %C1 = icmp ult i32 %and1, %B12
+  tail call void @llvm.assume(i1 %C1)
+  %cmp2 = icmp eq i32 0, %B12
+  tail call void @llvm.assume(i1 %cmp2)
+  unreachable
+}
+
+; PR47416
+
+define i32 @unreachable_assume(i32 %x, i32 %y) {
+; CHECK-LABEL: @unreachable_assume(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp sgt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[Y:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP0]], [[CMP1]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[OR]])
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], 1
+; CHECK-NEXT:    br i1 [[CMP2]], label [[IF:%.*]], label [[EXIT:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[A:%.*]] = and i32 [[Y]], -2
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i32 [[A]], 104
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP3]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %cmp0 = icmp sgt i32 %x, 1
+  %cmp1 = icmp eq i32 %y, 1
+  %or = or i1 %cmp0, %cmp1
+  tail call void @llvm.assume(i1 %or)
+  %cmp2 = icmp eq i32 %x, 1
+  br i1 %cmp2, label %if, label %exit
+
+if:
+  %a = and i32 %y, -2
+  %cmp3 = icmp ne i32 %a, 104
+  tail call void @llvm.assume(i1 %cmp3)
+  br label %exit
+
+exit:
+  %cmp4 = icmp eq i32 %x, 2
+  tail call void @llvm.assume(i1 %cmp4)
+  unreachable
+}
+
+define i32 @unreachable_assumes_and_store(i32 %x, i32 %y, i32* %p) {
+; CHECK-LABEL: @unreachable_assumes_and_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp sgt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[Y:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP0]], [[CMP1]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[OR]])
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], 1
+; CHECK-NEXT:    br i1 [[CMP2]], label [[IF:%.*]], label [[EXIT:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[A:%.*]] = and i32 [[Y]], -2
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i32 [[A]], 104
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP3]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %cmp0 = icmp sgt i32 %x, 1
+  %cmp1 = icmp eq i32 %y, 1
+  %or = or i1 %cmp0, %cmp1
+  tail call void @llvm.assume(i1 %or)
+  %cmp2 = icmp eq i32 %x, 1
+  br i1 %cmp2, label %if, label %exit
+
+if:
+  %a = and i32 %y, -2
+  %cmp3 = icmp ne i32 %a, 104
+  tail call void @llvm.assume(i1 %cmp3)
+  br label %exit
+
+exit:
+  %cmp4 = icmp eq i32 %x, 2
+  tail call void @llvm.assume(i1 %cmp4)
+  %cmp5 = icmp ugt i32 %y, 42
+  tail call void @llvm.assume(i1 %cmp5)
+  store i32 %x, i32* %p
+  unreachable
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!5, !6, !7, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "Me", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: null, retainedTypes: null, imports: null)
+!1 = !DILocalVariable(name: "", arg: 1, scope: !2, file: null, line: 1, type: null)
+!2 = distinct !DISubprogram(name: "debug", linkageName: "debug", scope: null, file: null, line: 0, type: null, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!3 = !DIFile(filename: "consecutive-fences.ll", directory: "")
+!5 = !{i32 2, !"Dwarf Version", i32 4}
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 4}
+!8 = !{i32 7, !"PIC Level", i32 2}
+!9 = !DILocation(line: 0, column: 0, scope: !2)
+
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+

diff --git a/llvm/test/Transforms/InstCombine/bswap-inseltpoison.ll b/llvm/test/Transforms/InstCombine/bswap-inseltpoison.ll
new file mode 100644
index 000000000000..3730496ba6e3
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/bswap-inseltpoison.ll
@@ -0,0 +1,867 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+define i32 @test1(i32 %i) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[T12:%.*]] = call i32 @llvm.bswap.i32(i32 [[I:%.*]])
+; CHECK-NEXT:    ret i32 [[T12]]
+;
+  %t1 = lshr i32 %i, 24
+  %t3 = lshr i32 %i, 8
+  %t4 = and i32 %t3, 65280
+  %t5 = or i32 %t1, %t4
+  %t7 = shl i32 %i, 8
+  %t8 = and i32 %t7, 16711680
+  %t9 = or i32 %t5, %t8
+  %t11 = shl i32 %i, 24
+  %t12 = or i32 %t9, %t11
+  ret i32 %t12
+}
+
+define <2 x i32> @test1_vector(<2 x i32> %i) {
+; CHECK-LABEL: @test1_vector(
+; CHECK-NEXT:    [[T12:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[I:%.*]])
+; CHECK-NEXT:    ret <2 x i32> [[T12]]
+;
+  %t1 = lshr <2 x i32> %i, <i32 24, i32 24>
+  %t3 = lshr <2 x i32> %i, <i32 8, i32 8>
+  %t4 = and <2 x i32> %t3, <i32 65280, i32 65280>
+  %t5 = or <2 x i32> %t1, %t4
+  %t7 = shl <2 x i32> %i, <i32 8, i32 8>
+  %t8 = and <2 x i32> %t7, <i32 16711680, i32 16711680>
+  %t9 = or <2 x i32> %t5, %t8
+  %t11 = shl <2 x i32> %i, <i32 24, i32 24>
+  %t12 = or <2 x i32> %t9, %t11
+  ret <2 x i32> %t12
+}
+
+define i32 @test2(i32 %arg) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[T14:%.*]] = call i32 @llvm.bswap.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    ret i32 [[T14]]
+;
+  %t2 = shl i32 %arg, 24
+  %t4 = shl i32 %arg, 8
+  %t5 = and i32 %t4, 16711680
+  %t6 = or i32 %t2, %t5
+  %t8 = lshr i32 %arg, 8
+  %t9 = and i32 %t8, 65280
+  %t10 = or i32 %t6, %t9
+  %t12 = lshr i32 %arg, 24
+  %t14 = or i32 %t10, %t12
+  ret i32 %t14
+}
+
+define <2 x i32> @test2_vector(<2 x i32> %arg) {
+; CHECK-LABEL: @test2_vector(
+; CHECK-NEXT:    [[T14:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[ARG:%.*]])
+; CHECK-NEXT:    ret <2 x i32> [[T14]]
+;
+  %t2 = shl <2 x i32> %arg, <i32 24, i32 24>
+  %t4 = shl <2 x i32> %arg, <i32 8, i32 8>
+  %t5 = and <2 x i32> %t4, <i32 16711680, i32 16711680>
+  %t6 = or <2 x i32> %t2, %t5
+  %t8 = lshr <2 x i32> %arg, <i32 8, i32 8>
+  %t9 = and <2 x i32> %t8, <i32 65280, i32 65280>
+  %t10 = or <2 x i32> %t6, %t9
+  %t12 = lshr <2 x i32> %arg, <i32 24, i32 24>
+  %t14 = or <2 x i32> %t10, %t12
+  ret <2 x i32> %t14
+}
+
+define <2 x i32> @test2_vector_undef(<2 x i32> %arg) {
+; CHECK-LABEL: @test2_vector_undef(
+; CHECK-NEXT:    [[T2:%.*]] = shl <2 x i32> [[ARG:%.*]], <i32 24, i32 undef>
+; CHECK-NEXT:    [[T4:%.*]] = shl <2 x i32> [[ARG]], <i32 8, i32 8>
+; CHECK-NEXT:    [[T5:%.*]] = and <2 x i32> [[T4]], <i32 16711680, i32 undef>
+; CHECK-NEXT:    [[T6:%.*]] = or <2 x i32> [[T2]], [[T5]]
+; CHECK-NEXT:    [[T8:%.*]] = lshr <2 x i32> [[ARG]], <i32 8, i32 8>
+; CHECK-NEXT:    [[T9:%.*]] = and <2 x i32> [[T8]], <i32 65280, i32 undef>
+; CHECK-NEXT:    [[T10:%.*]] = or <2 x i32> [[T6]], [[T9]]
+; CHECK-NEXT:    [[T12:%.*]] = lshr <2 x i32> [[ARG]], <i32 24, i32 undef>
+; CHECK-NEXT:    [[T14:%.*]] = or <2 x i32> [[T10]], [[T12]]
+; CHECK-NEXT:    ret <2 x i32> [[T14]]
+;
+  %t2 = shl <2 x i32> %arg, <i32 24, i32 undef>
+  %t4 = shl <2 x i32> %arg, <i32 8, i32 8>
+  %t5 = and <2 x i32> %t4, <i32 16711680, i32 undef>
+  %t6 = or <2 x i32> %t2, %t5
+  %t8 = lshr <2 x i32> %arg, <i32 8, i32 8>
+  %t9 = and <2 x i32> %t8, <i32 65280, i32 undef>
+  %t10 = or <2 x i32> %t6, %t9
+  %t12 = lshr <2 x i32> %arg, <i32 24, i32 undef>
+  %t14 = or <2 x i32> %t10, %t12
+  ret <2 x i32> %t14
+}
+
+define i16 @test3(i16 %s) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[T5:%.*]] = call i16 @llvm.bswap.i16(i16 [[S:%.*]])
+; CHECK-NEXT:    ret i16 [[T5]]
+;
+  %t2 = lshr i16 %s, 8
+  %t4 = shl i16 %s, 8
+  %t5 = or i16 %t2, %t4
+  ret i16 %t5
+}
+
+define <2 x i16> @test3_vector(<2 x i16> %s) {
+; CHECK-LABEL: @test3_vector(
+; CHECK-NEXT:    [[T5:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[S:%.*]])
+; CHECK-NEXT:    ret <2 x i16> [[T5]]
+;
+  %t2 = lshr <2 x i16> %s, <i16 8, i16 8>
+  %t4 = shl <2 x i16> %s, <i16 8, i16 8>
+  %t5 = or <2 x i16> %t2, %t4
+  ret <2 x i16> %t5
+}
+
+define <2 x i16> @test3_vector_undef(<2 x i16> %s) {
+; CHECK-LABEL: @test3_vector_undef(
+; CHECK-NEXT:    [[T5:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[S:%.*]])
+; CHECK-NEXT:    ret <2 x i16> [[T5]]
+;
+  %t2 = lshr <2 x i16> %s, <i16 undef, i16 8>
+  %t4 = shl <2 x i16> %s, <i16 8, i16 undef>
+  %t5 = or <2 x i16> %t2, %t4
+  ret <2 x i16> %t5
+}
+
+define i16 @test4(i16 %s) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[T5:%.*]] = call i16 @llvm.bswap.i16(i16 [[S:%.*]])
+; CHECK-NEXT:    ret i16 [[T5]]
+;
+  %t2 = lshr i16 %s, 8
+  %t4 = shl i16 %s, 8
+  %t5 = or i16 %t4, %t2
+  ret i16 %t5
+}
+
+define <2 x i16> @test4_vector(<2 x i16> %s) {
+; CHECK-LABEL: @test4_vector(
+; CHECK-NEXT:    [[T5:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[S:%.*]])
+; CHECK-NEXT:    ret <2 x i16> [[T5]]
+;
+  %t2 = lshr <2 x i16> %s, <i16 8, i16 8>
+  %t4 = shl <2 x i16> %s, <i16 8, i16 8>
+  %t5 = or <2 x i16> %t4, %t2
+  ret <2 x i16> %t5
+}
+
+define i16 @test5(i16 %a) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[T_UPGRD_3:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]])
+; CHECK-NEXT:    ret i16 [[T_UPGRD_3]]
+;
+  %t = zext i16 %a to i32
+  %t1 = and i32 %t, 65280
+  %t2 = ashr i32 %t1, 8
+  %t2.upgrd.1 = trunc i32 %t2 to i16
+  %t4 = and i32 %t, 255
+  %t5 = shl i32 %t4, 8
+  %t5.upgrd.2 = trunc i32 %t5 to i16
+  %t.upgrd.3 = or i16 %t2.upgrd.1, %t5.upgrd.2
+  %t6 = bitcast i16 %t.upgrd.3 to i16
+  %t6.upgrd.4 = zext i16 %t6 to i32
+  %retval = trunc i32 %t6.upgrd.4 to i16
+  ret i16 %retval
+}
+
+define <2 x i16> @test5_vector(<2 x i16> %a) {
+; CHECK-LABEL: @test5_vector(
+; CHECK-NEXT:    [[T_UPGRD_3:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[A:%.*]])
+; CHECK-NEXT:    ret <2 x i16> [[T_UPGRD_3]]
+;
+  %t = zext <2 x i16> %a to <2 x i32>
+  %t1 = and <2 x i32> %t, <i32 65280, i32 65280>
+  %t2 = ashr <2 x i32> %t1, <i32 8, i32 8>
+  %t2.upgrd.1 = trunc <2 x i32> %t2 to <2 x i16>
+  %t4 = and <2 x i32> %t, <i32 255, i32 255>
+  %t5 = shl <2 x i32> %t4, <i32 8, i32 8>
+  %t5.upgrd.2 = trunc <2 x i32> %t5 to <2 x i16>
+  %t.upgrd.3 = or <2 x i16> %t2.upgrd.1, %t5.upgrd.2
+  %t6 = bitcast <2 x i16> %t.upgrd.3 to <2 x i16>
+  %t6.upgrd.4 = zext <2 x i16> %t6 to <2 x i32>
+  %retval = trunc <2 x i32> %t6.upgrd.4 to <2 x i16>
+  ret <2 x i16> %retval
+}
+
+; PR2842
+define i32 @test6(i32 %x) nounwind readnone {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[T7:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[T7]]
+;
+  %t = shl i32 %x, 16
+  %x.mask = and i32 %x, 65280
+  %t1 = lshr i32 %x, 16
+  %t2 = and i32 %t1, 255
+  %t3 = or i32 %x.mask, %t
+  %t4 = or i32 %t3, %t2
+  %t5 = shl i32 %t4, 8
+  %t6 = lshr i32 %x, 24
+  %t7 = or i32 %t5, %t6
+  ret i32 %t7
+}
+
+define <2 x i32> @test6_vector(<2 x i32> %x) nounwind readnone {
+; CHECK-LABEL: @test6_vector(
+; CHECK-NEXT:    [[T7:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret <2 x i32> [[T7]]
+;
+  %t = shl <2 x i32> %x, <i32 16, i32 16>
+  %x.mask = and <2 x i32> %x, <i32 65280, i32 65280>
+  %t1 = lshr <2 x i32> %x, <i32 16, i32 16>
+  %t2 = and <2 x i32> %t1, <i32 255, i32 255>
+  %t3 = or <2 x i32> %x.mask, %t
+  %t4 = or <2 x i32> %t3, %t2
+  %t5 = shl <2 x i32> %t4, <i32 8, i32 8>
+  %t6 = lshr <2 x i32> %x, <i32 24, i32 24>
+  %t7 = or <2 x i32> %t5, %t6
+  ret <2 x i32> %t7
+}
+
+declare void @extra_use(i32)
+
+; swaphalf = (x << 16 | x >> 16)
+; ((swaphalf & 0x00ff00ff) << 8) | ((swaphalf >> 8) & 0x00ff00ff)
+
+define i32 @bswap32_and_first(i32 %x) {
+; CHECK-LABEL: @bswap32_and_first(
+; CHECK-NEXT:    [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[BSWAP]]
+;
+  %shl = shl i32 %x, 16
+  %shr = lshr i32 %x, 16
+  %swaphalf = or i32 %shl, %shr
+  %t = and i32 %swaphalf, 16711935
+  %tshl = shl nuw i32 %t, 8
+  %b = lshr i32 %swaphalf, 8
+  %band = and i32 %b, 16711935
+  %bswap = or i32 %tshl, %band
+  ret i32 %bswap
+}
+
+; Extra use should not prevent matching to bswap.
+; swaphalf = (x << 16 | x >> 16)
+; ((swaphalf & 0x00ff00ff) << 8) | ((swaphalf >> 8) & 0x00ff00ff)
+
+define i32 @bswap32_and_first_extra_use(i32 %x) {
+; CHECK-LABEL: @bswap32_and_first_extra_use(
+; CHECK-NEXT:    [[SWAPHALF:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 16)
+; CHECK-NEXT:    [[T:%.*]] = and i32 [[SWAPHALF]], 16711935
+; CHECK-NEXT:    [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X]])
+; CHECK-NEXT:    call void @extra_use(i32 [[T]])
+; CHECK-NEXT:    ret i32 [[BSWAP]]
+;
+  %shl = shl i32 %x, 16
+  %shr = lshr i32 %x, 16
+  %swaphalf = or i32 %shl, %shr
+  %t = and i32 %swaphalf, 16711935
+  %tshl = shl nuw i32 %t, 8
+  %b = lshr i32 %swaphalf, 8
+  %band = and i32 %b, 16711935
+  %bswap = or i32 %tshl, %band
+  call void @extra_use(i32 %t)
+  ret i32 %bswap
+}
+
+; swaphalf = (x << 16 | x >> 16)
+; ((swaphalf << 8) & 0xff00ff00) | ((swaphalf >> 8) & 0x00ff00ff)
+
+; PR23863
+define i32 @bswap32_shl_first(i32 %x) {
+; CHECK-LABEL: @bswap32_shl_first(
+; CHECK-NEXT:    [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[BSWAP]]
+;
+  %shl = shl i32 %x, 16
+  %shr = lshr i32 %x, 16
+  %swaphalf = or i32 %shl, %shr
+  %t = shl i32 %swaphalf, 8
+  %tand = and i32 %t, -16711936
+  %b = lshr i32 %swaphalf, 8
+  %band = and i32 %b, 16711935
+  %bswap = or i32 %tand, %band
+  ret i32 %bswap
+}
+
+; Extra use should not prevent matching to bswap.
+; swaphalf = (x << 16 | x >> 16)
+; ((swaphalf << 8) & 0xff00ff00) | ((swaphalf >> 8) & 0x00ff00ff)
+
+define i32 @bswap32_shl_first_extra_use(i32 %x) {
+; CHECK-LABEL: @bswap32_shl_first_extra_use(
+; CHECK-NEXT:    [[SWAPHALF:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 16)
+; CHECK-NEXT:    [[T:%.*]] = shl i32 [[SWAPHALF]], 8
+; CHECK-NEXT:    [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X]])
+; CHECK-NEXT:    call void @extra_use(i32 [[T]])
+; CHECK-NEXT:    ret i32 [[BSWAP]]
+;
+  %shl = shl i32 %x, 16
+  %shr = lshr i32 %x, 16
+  %swaphalf = or i32 %shl, %shr
+  %t = shl i32 %swaphalf, 8
+  %tand = and i32 %t, -16711936
+  %b = lshr i32 %swaphalf, 8
+  %band = and i32 %b, 16711935
+  %bswap = or i32 %tand, %band
+  call void @extra_use(i32 %t)
+  ret i32 %bswap
+}
+
+define i16 @test8(i16 %a) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[OR:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]])
+; CHECK-NEXT:    ret i16 [[OR]]
+;
+  %conv = zext i16 %a to i32
+  %shr = lshr i16 %a, 8
+  %shl = shl i32 %conv, 8
+  %conv1 = zext i16 %shr to i32
+  %or = or i32 %conv1, %shl
+  %conv2 = trunc i32 %or to i16
+  ret i16 %conv2
+}
+
+define i16 @test9(i16 %a) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[OR:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]])
+; CHECK-NEXT:    ret i16 [[OR]]
+;
+  %conv = zext i16 %a to i32
+  %shr = lshr i32 %conv, 8
+  %shl = shl i32 %conv, 8
+  %or = or i32 %shr, %shl
+  %conv2 = trunc i32 %or to i16
+  ret i16 %conv2
+}
+
+define i16 @test10(i32 %a) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[A:%.*]] to i16
+; CHECK-NEXT:    [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[TRUNC]])
+; CHECK-NEXT:    ret i16 [[REV]]
+;
+  %shr1 = lshr i32 %a, 8
+  %and1 = and i32 %shr1, 255
+  %and2 = shl i32 %a, 8
+  %shl1 = and i32 %and2, 65280
+  %or = or i32 %and1, %shl1
+  %conv = trunc i32 %or to i16
+  ret i16 %conv
+}
+
+define <2 x i16> @test10_vector(<2 x i32> %a) {
+; CHECK-LABEL: @test10_vector(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i32> [[A:%.*]] to <2 x i16>
+; CHECK-NEXT:    [[REV:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[TRUNC]])
+; CHECK-NEXT:    ret <2 x i16> [[REV]]
+;
+  %shr1 = lshr <2 x i32> %a, <i32 8, i32 8>
+  %and1 = and <2 x i32> %shr1, <i32 255, i32 255>
+  %and2 = shl <2 x i32> %a, <i32 8, i32 8>
+  %shl1 = and <2 x i32> %and2, <i32 65280, i32 65280>
+  %or = or <2 x i32> %and1, %shl1
+  %conv = trunc <2 x i32> %or to <2 x i16>
+  ret <2 x i16> %conv
+}
+
+define i64 @PR39793_bswap_u64_as_u32(i64 %0) {
+; CHECK-LABEL: @PR39793_bswap_u64_as_u32(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[REV:%.*]] = call i32 @llvm.bswap.i32(i32 [[TRUNC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[REV]] to i64
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %2 = lshr i64 %0, 24
+  %3 = and i64 %2, 255
+  %4 = lshr i64 %0, 8
+  %5 = and i64 %4, 65280
+  %6 = or i64 %3, %5
+  %7 = shl i64 %0, 8
+  %8 = and i64 %7, 16711680
+  %9 = or i64 %6, %8
+  %10 = shl i64 %0, 24
+  %11 = and i64 %10, 4278190080
+  %12 = or i64 %9, %11
+  ret i64 %12
+}
+
+define i16 @PR39793_bswap_u64_as_u32_trunc(i64 %0) {
+; CHECK-LABEL: @PR39793_bswap_u64_as_u32_trunc(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[REV:%.*]] = call i32 @llvm.bswap.i32(i32 [[TRUNC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[REV]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %2 = lshr i64 %0, 24
+  %3 = and i64 %2, 255
+  %4 = lshr i64 %0, 8
+  %5 = and i64 %4, 65280
+  %6 = or i64 %3, %5
+  %7 = shl i64 %0, 8
+  %8 = and i64 %7, 16711680
+  %9 = or i64 %6, %8
+  %10 = shl i64 %0, 24
+  %11 = and i64 %10, 4278190080
+  %12 = or i64 %9, %11
+  %13 = trunc i64 %12 to i16
+  ret i16 %13
+}
+
+define i64 @PR39793_bswap_u64_as_u16(i64 %0) {
+; CHECK-LABEL: @PR39793_bswap_u64_as_u16(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 [[TMP0:%.*]] to i16
+; CHECK-NEXT:    [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[TRUNC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[REV]] to i64
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %2 = lshr i64 %0, 8
+  %3 = and i64 %2, 255
+  %4 = shl i64 %0, 8
+  %5 = and i64 %4, 65280
+  %6 = or i64 %3, %5
+  ret i64 %6
+}
+
+define <2 x i64> @PR39793_bswap_u64_as_u16_vector(<2 x i64> %0) {
+; CHECK-LABEL: @PR39793_bswap_u64_as_u16_vector(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i64> [[TMP0:%.*]] to <2 x i16>
+; CHECK-NEXT:    [[REV:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[TRUNC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i16> [[REV]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %2 = lshr <2 x i64> %0, <i64 8, i64 8>
+  %3 = and <2 x i64> %2, <i64 255, i64 255>
+  %4 = shl <2 x i64> %0, <i64 8, i64 8>
+  %5 = and <2 x i64> %4, <i64 65280, i64 65280>
+  %6 = or <2 x i64> %3, %5
+  ret <2 x i64> %6
+}
+
+define i8 @PR39793_bswap_u64_as_u16_trunc(i64 %0) {
+; CHECK-LABEL: @PR39793_bswap_u64_as_u16_trunc(
+; CHECK-NEXT:    [[REV1:%.*]] = lshr i64 [[TMP0:%.*]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[REV1]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %2 = lshr i64 %0, 8
+  %3 = and i64 %2, 255
+  %4 = shl i64 %0, 8
+  %5 = and i64 %4, 65280
+  %6 = or i64 %3, %5
+  %7 = trunc i64 %6 to i8
+  ret i8 %7
+}
+
+define i50 @PR39793_bswap_u50_as_u16(i50 %0) {
+; CHECK-LABEL: @PR39793_bswap_u50_as_u16(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i50 [[TMP0:%.*]] to i16
+; CHECK-NEXT:    [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[TRUNC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[REV]] to i50
+; CHECK-NEXT:    ret i50 [[TMP2]]
+;
+  %2 = lshr i50 %0, 8
+  %3 = and i50 %2, 255
+  %4 = shl i50 %0, 8
+  %5 = and i50 %4, 65280
+  %6 = or i50 %3, %5
+  ret i50 %6
+}
+
+define i32 @PR39793_bswap_u32_as_u16(i32 %0) {
+; CHECK-LABEL: @PR39793_bswap_u32_as_u16(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[TMP0:%.*]] to i16
+; CHECK-NEXT:    [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[TRUNC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[REV]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = lshr i32 %0, 8
+  %3 = and i32 %2, 255
+  %4 = shl i32 %0, 8
+  %5 = and i32 %4, 65280
+  %6 = or i32 %3, %5
+  ret i32 %6
+}
+
+define i8 @PR39793_bswap_u32_as_u16_trunc(i32 %0) {
+; CHECK-LABEL: @PR39793_bswap_u32_as_u16_trunc(
+; CHECK-NEXT:    [[REV1:%.*]] = lshr i32 [[TMP0:%.*]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[REV1]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %2 = lshr i32 %0, 8
+  %3 = and i32 %2, 255
+  %4 = shl i32 %0, 8
+  %5 = and i32 %4, 65280
+  %6 = or i32 %3, %5
+  %7 = trunc i32 %6 to i8
+  ret i8 %7
+}
+
+define i32 @partial_bswap(i32 %x) {
+; CHECK-LABEL: @partial_bswap(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %x3 = shl i32 %x, 24
+  %a2 = shl i32 %x, 8
+  %x2 = and i32 %a2, 16711680
+  %x32 = or i32 %x3, %x2
+  %t1 = and i32 %x, -65536
+  %t2 = call i32 @llvm.bswap.i32(i32 %t1)
+  %r = or i32 %x32, %t2
+  ret i32 %r
+}
+declare i32 @llvm.bswap.i32(i32)
+
+define <2 x i32> @partial_bswap_vector(<2 x i32> %x) {
+; CHECK-LABEL: @partial_bswap_vector(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
+;
+  %x3 = shl <2 x i32> %x, <i32 24, i32 24>
+  %a2 = shl <2 x i32> %x, <i32 8, i32 8>
+  %x2 = and <2 x i32> %a2, <i32 16711680, i32 16711680>
+  %x32 = or <2 x i32> %x3, %x2
+  %t1 = and <2 x i32> %x, <i32 -65536, i32 -65536>
+  %t2 = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %t1)
+  %r = or <2 x i32> %x32, %t2
+  ret <2 x i32> %r
+}
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
+
+define i16 @partial_bitreverse(i16 %x) {
+; CHECK-LABEL: @partial_bitreverse(
+; CHECK-NEXT:    [[OR:%.*]] = call i16 @llvm.bswap.i16(i16 [[X:%.*]])
+; CHECK-NEXT:    ret i16 [[OR]]
+;
+  %rev= call i16 @llvm.bitreverse.i16(i16 %x)
+  %lo = and i16 %rev, 255
+  %hi = and i16 %rev, -256
+  %revlo = call i16 @llvm.bitreverse.i16(i16 %lo)
+  %revhi = call i16 @llvm.bitreverse.i16(i16 %hi)
+  %newlo = lshr i16 %revlo, 8
+  %newhi = shl  i16 %revhi, 8
+  %or = or i16 %newlo, %newhi
+  ret i16 %or
+}
+declare i16 @llvm.bitreverse.i16(i16)
+
+define i64 @bswap_and_mask_0(i64 %0) {
+; CHECK-LABEL: @bswap_and_mask_0(
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP0:%.*]], -72057594037927681
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+  %2 = lshr i64 %0, 56
+  %3 = shl i64 %0, 56
+  %4 = or i64 %2, %3
+  ret i64 %4
+}
+
+define i64 @bswap_and_mask_1(i64 %0) {
+; CHECK-LABEL: @bswap_and_mask_1(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0:%.*]], 56
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP0]], 40
+; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[TMP3]], 65280
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    ret i64 [[TMP5]]
+;
+  %2 = lshr i64 %0, 56
+  %3 = lshr i64 %0, 40
+  %4 = and i64 %3, 65280
+  %5 = or i64 %4, %2
+  ret i64 %5
+}
+
+define i64 @bswap_and_mask_2(i64 %0) {
+; CHECK-LABEL: @bswap_and_mask_2(
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP0:%.*]], -72057594037862401
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+  %2 = lshr i64 %0, 56
+  %3 = shl i64 %0, 56
+  %4 = or i64 %2, %3
+  %5 = shl i64 %0, 40
+  %6 = and i64 %5, 71776119061217280
+  %7 = or i64 %4, %6
+  ret i64 %7
+}
+
+define i64 @bswap_trunc(i64 %x01234567) {
+; CHECK-LABEL: @bswap_trunc(
+; CHECK-NEXT:    [[X7ZZZZZZZ:%.*]] = shl i64 [[X01234567:%.*]], 56
+; CHECK-NEXT:    [[XZ0123456:%.*]] = lshr i64 [[X01234567]], 8
+; CHECK-NEXT:    [[XZZZZZ012:%.*]] = lshr i64 [[X01234567]], 40
+; CHECK-NEXT:    [[X3456:%.*]] = trunc i64 [[XZ0123456]] to i32
+; CHECK-NEXT:    [[XZ012:%.*]] = trunc i64 [[XZZZZZ012]] to i32
+; CHECK-NEXT:    [[X6543:%.*]] = call i32 @llvm.bswap.i32(i32 [[X3456]])
+; CHECK-NEXT:    [[X210Z:%.*]] = call i32 @llvm.bswap.i32(i32 [[XZ012]])
+; CHECK-NEXT:    [[XZ210:%.*]] = lshr exact i32 [[X210Z]], 8
+; CHECK-NEXT:    [[XZZZZ6543:%.*]] = zext i32 [[X6543]] to i64
+; CHECK-NEXT:    [[XZZZZZ210:%.*]] = zext i32 [[XZ210]] to i64
+; CHECK-NEXT:    [[XZ6543ZZZ:%.*]] = shl nuw nsw i64 [[XZZZZ6543]], 24
+; CHECK-NEXT:    [[XZ6543210:%.*]] = or i64 [[XZ6543ZZZ]], [[XZZZZZ210]]
+; CHECK-NEXT:    [[X76543210:%.*]] = or i64 [[XZ6543210]], [[X7ZZZZZZZ]]
+; CHECK-NEXT:    ret i64 [[X76543210]]
+;
+  %x7zzzzzzz = shl i64 %x01234567, 56
+  %xz0123456 = lshr i64 %x01234567, 8
+  %xzzzzz012 = lshr i64 %x01234567, 40
+  %x3456 = trunc i64 %xz0123456 to i32
+  %xz012 = trunc i64 %xzzzzz012 to i32
+  %x6543 = call i32 @llvm.bswap.i32(i32 %x3456)
+  %x210z = call i32 @llvm.bswap.i32(i32 %xz012)
+  %xz210 = lshr i32 %x210z, 8
+  %xzzzz6543 = zext i32 %x6543 to i64
+  %xzzzzz210 = zext i32 %xz210 to i64
+  %xz6543zzz = shl i64 %xzzzz6543, 24
+  %xz6543210 = or i64 %xzzzzz210, %xz6543zzz
+  %x76543210 = or i64 %xz6543210, %x7zzzzzzz
+  ret i64 %x76543210
+}
+
+define i32 @shuf_4bytes(<4 x i8> %x) {
+; CHECK-LABEL: @shuf_4bytes(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[X:%.*]] to i32
+; CHECK-NEXT:    [[CAST:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    ret i32 [[CAST]]
+;
+  %bswap = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %cast = bitcast <4 x i8> %bswap to i32
+  ret i32 %cast
+}
+
+define i32 @shuf_load_4bytes(<4 x i8>* %p) {
+; CHECK-LABEL: @shuf_load_4bytes(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8>* [[P:%.*]] to i32*
+; CHECK-NEXT:    [[X1:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[CAST:%.*]] = call i32 @llvm.bswap.i32(i32 [[X1]])
+; CHECK-NEXT:    ret i32 [[CAST]]
+;
+  %x = load <4 x i8>, <4 x i8>* %p
+  %bswap = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 undef, i32 0>
+  %cast = bitcast <4 x i8> %bswap to i32
+  ret i32 %cast
+}
+
+define i32 @shuf_bitcast_twice_4bytes(i32 %x) {
+; CHECK-LABEL: @shuf_bitcast_twice_4bytes(
+; CHECK-NEXT:    [[CAST2:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[CAST2]]
+;
+  %cast1 = bitcast i32 %x to <4 x i8>
+  %bswap = shufflevector <4 x i8> %cast1, <4 x i8> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
+  %cast2 = bitcast <4 x i8> %bswap to i32
+  ret i32 %cast2
+}
+
+; Negative test - extra use
+declare void @use(<4 x i8>)
+
+define i32 @shuf_4bytes_extra_use(<4 x i8> %x) {
+; CHECK-LABEL: @shuf_4bytes_extra_use(
+; CHECK-NEXT:    [[BSWAP:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    call void @use(<4 x i8> [[BSWAP]])
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <4 x i8> [[BSWAP]] to i32
+; CHECK-NEXT:    ret i32 [[CAST]]
+;
+  %bswap = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  call void @use(<4 x i8> %bswap)
+  %cast = bitcast <4 x i8> %bswap to i32
+  ret i32 %cast
+}
+
+; Negative test - scalar type is not in the data layout
+
+define i128 @shuf_16bytes(<16 x i8> %x) {
+; CHECK-LABEL: @shuf_16bytes(
+; CHECK-NEXT:    [[BSWAP:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <16 x i8> [[BSWAP]] to i128
+; CHECK-NEXT:    ret i128 [[CAST]]
+;
+  %bswap = shufflevector <16 x i8> %x, <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %cast = bitcast <16 x i8> %bswap to i128
+  ret i128 %cast
+}
+
+; Negative test - don't touch widening shuffles (for now)
+
+define i32 @shuf_2bytes_widening(<2 x i8> %x) {
+; CHECK-LABEL: @shuf_2bytes_widening(
+; CHECK-NEXT:    [[BSWAP:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <4 x i8> [[BSWAP]] to i32
+; CHECK-NEXT:    ret i32 [[CAST]]
+;
+  %bswap = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
+  %cast = bitcast <4 x i8> %bswap to i32
+  ret i32 %cast
+}
+
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+
+define i32 @funnel_unary(i32 %abcd) {
+; CHECK-LABEL: @funnel_unary(
+; CHECK-NEXT:    [[DCBA:%.*]] = call i32 @llvm.bswap.i32(i32 [[ABCD:%.*]])
+; CHECK-NEXT:    ret i32 [[DCBA]]
+;
+  %dabc = call i32 @llvm.fshl.i32(i32 %abcd, i32 %abcd, i32 24)
+  %bcda = call i32 @llvm.fshr.i32(i32 %abcd, i32 %abcd, i32 24)
+  %dzbz = and i32 %dabc, -16711936
+  %zcza = and i32 %bcda,  16711935
+  %dcba = or i32 %dzbz, %zcza
+  ret i32 %dcba
+}
+
+define i32 @funnel_binary(i32 %abcd) {
+; CHECK-LABEL: @funnel_binary(
+; CHECK-NEXT:    [[DCBA:%.*]] = call i32 @llvm.bswap.i32(i32 [[ABCD:%.*]])
+; CHECK-NEXT:    ret i32 [[DCBA]]
+;
+  %cdzz = shl i32 %abcd, 16
+  %dcdz = call i32 @llvm.fshl.i32(i32 %abcd, i32 %cdzz, i32 24)
+  %zzab = lshr i32 %abcd, 16
+  %zaba = call i32 @llvm.fshr.i32(i32 %zzab, i32 %abcd, i32 24)
+  %dczz = and i32 %dcdz, -65536
+  %zzba = and i32 %zaba,  65535
+  %dcba = or i32 %dczz, %zzba
+  ret i32 %dcba
+}
+
+define i32 @funnel_and(i32 %abcd) {
+; CHECK-LABEL: @funnel_and(
+; CHECK-NEXT:    [[DCBA:%.*]] = call i32 @llvm.bswap.i32(i32 [[ABCD:%.*]])
+; CHECK-NEXT:    ret i32 [[DCBA]]
+;
+  %zzcz = and i32 %abcd, 65280
+  %zcza = call i32 @llvm.fshl.i32(i32 %zzcz, i32 %abcd, i32 8)
+  %zbzz = and i32 %abcd, 16711680
+  %dzbz = call i32 @llvm.fshl.i32(i32 %abcd, i32 %zbzz, i32 24)
+  %dcba = or i32 %zcza, %dzbz
+  ret i32 %dcba
+}
+
+; PR47191 - deep IR trees prevent ADD/XOR instructions being simplified to OR.
+
+define i64 @PR47191_problem1(i64 %0) {
+; CHECK-LABEL: @PR47191_problem1(
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP0:%.*]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %2 = lshr i64 %0, 56
+  %3 = lshr i64 %0, 40
+  %4 = and i64 %3, 65280
+  %5 = lshr i64 %0, 24
+  %6 = and i64 %5, 16711680
+  %7 = lshr i64 %0, 8
+  %8 = and i64 %7, 4278190080
+  %9 = shl i64 %0, 56
+  %10 = shl i64 %0, 40
+  %11 = and i64 %10, 71776119061217280
+  %12 = shl i64 %0, 24
+  %13 = and i64 %12, 280375465082880
+  %14 = or i64 %9, %2
+  %15 = or i64 %14, %4
+  %16 = or i64 %15, %6
+  %17 = or i64 %16, %8
+  %18 = or i64 %17, %11
+  %19 = or i64 %18, %13
+  %20 = shl i64 %0, 8
+  %21 = and i64 %20, 1095216660480
+  %22 = add i64 %19, %21
+  ret i64 %22
+}
+
+define i64 @PR47191_problem2(i64 %0) {
+; CHECK-LABEL: @PR47191_problem2(
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP0:%.*]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %2 = lshr i64 %0, 56
+  %3 = lshr i64 %0, 40
+  %4 = and i64 %3, 65280
+  %5 = lshr i64 %0, 24
+  %6 = and i64 %5, 16711680
+  %7 = lshr i64 %0, 8
+  %8 = and i64 %7, 4278190080
+  %9 = shl i64 %0, 56
+  %10 = shl i64 %0, 40
+  %11 = and i64 %10, 71776119061217280
+  %12 = or i64 %9, %2
+  %13 = or i64 %12, %4
+  %14 = or i64 %13, %6
+  %15 = or i64 %14, %8
+  %16 = or i64 %15, %11
+  %17 = shl i64 %0, 24
+  %18 = and i64 %17, 280375465082880
+  %19 = shl i64 %0, 8
+  %20 = and i64 %19, 1095216660480
+  %21 = or i64 %20, %18
+  %22 = xor i64 %21, %16
+  ret i64 %22
+}
+
+define i64 @PR47191_problem3(i64 %0) {
+; CHECK-LABEL: @PR47191_problem3(
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP0:%.*]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %2 = lshr i64 %0, 56
+  %3 = lshr i64 %0, 40
+  %4 = and i64 %3, 65280
+  %5 = lshr i64 %0, 24
+  %6 = and i64 %5, 16711680
+  %7 = lshr i64 %0, 8
+  %8 = and i64 %7, 4278190080
+  %9 = shl i64 %0, 56
+  %10 = shl i64 %0, 40
+  %11 = and i64 %10, 71776119061217280
+  %12 = or i64 %9, %2
+  %13 = or i64 %12, %4
+  %14 = or i64 %13, %6
+  %15 = or i64 %14, %8
+  %16 = or i64 %15, %11
+  %17 = shl i64 %0, 24
+  %18 = and i64 %17, 280375465082880
+  %19 = shl i64 %0, 8
+  %20 = and i64 %19, 1095216660480
+  %21 = or i64 %20, %18
+  %22 = xor i64 %21, %16
+  ret i64 %22
+}
+
+define i64 @PR47191_problem4(i64 %0) {
+; CHECK-LABEL: @PR47191_problem4(
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP0:%.*]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %2 = lshr i64 %0, 56
+  %3 = shl i64 %0, 56
+  %4 = or i64 %2, %3
+  %5 = lshr i64 %0, 40
+  %6 = and i64 %5, 65280
+  %7 = or i64 %4, %6
+  %8 = shl i64 %0, 40
+  %9 = and i64 %8, 71776119061217280
+  %10 = or i64 %7, %9
+  %11 = lshr i64 %0, 24
+  %12 = and i64 %11, 16711680
+  %13 = or i64 %10, %12
+  %14 = shl i64 %0, 24
+  %15 = and i64 %14, 280375465082880
+  %16 = or i64 %13, %15
+  %17 = lshr i64 %0, 8
+  %18 = and i64 %17, 4278190080
+  %19 = or i64 %16, %18
+  %20 = shl i64 %0, 8
+  %21 = and i64 %20, 1095216660480
+  %22 = add i64 %19, %21
+  ret i64 %22
+}

diff --git a/llvm/test/Transforms/InstCombine/extractelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/extractelement-inseltpoison.ll
index d614bb06879e..82c1a5c98294 100644
--- a/llvm/test/Transforms/InstCombine/extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/extractelement-inseltpoison.ll
@@ -35,7 +35,7 @@ define i64 @test2(i64 %in) {
 ; ANY-NEXT:    ret i64 [[IN:%.*]]
 ;
   %vec = insertelement <8 x i64> poison, i64 %in, i32 0
-  %splat = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> zeroinitializer
+  %splat = shufflevector <8 x i64> %vec, <8 x i64> poison, <8 x i32> zeroinitializer
   %add = add <8 x i64> %splat, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   %r = extractelement <8 x i64> %add, i32 0
   ret i64 %r

diff --git a/llvm/test/Transforms/InstCombine/fmul-inseltpoison.ll b/llvm/test/Transforms/InstCombine/fmul-inseltpoison.ll
new file mode 100644
index 000000000000..083b7a6ad6ad
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fmul-inseltpoison.ll
@@ -0,0 +1,1176 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; (-0.0 - X) * C => X * -C
+define float @neg_constant(float %x) {
+; CHECK-LABEL: @neg_constant(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul ninf float [[X:%.*]], -2.000000e+01
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub = fsub float -0.0, %x
+  %mul = fmul ninf float %sub, 2.0e+1
+  ret float %mul
+}
+
+define float @unary_neg_constant(float %x) {
+; CHECK-LABEL: @unary_neg_constant(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul ninf float [[X:%.*]], -2.000000e+01
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub = fneg float %x
+  %mul = fmul ninf float %sub, 2.0e+1
+  ret float %mul
+}
+
+define <2 x float> @neg_constant_vec(<2 x float> %x) {
+; CHECK-LABEL: @neg_constant_vec(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul ninf <2 x float> [[X:%.*]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub = fsub <2 x float> <float -0.0, float -0.0>, %x
+  %mul = fmul ninf <2 x float> %sub, <float 2.0, float 3.0>
+  ret <2 x float> %mul
+}
+
+define <2 x float> @unary_neg_constant_vec(<2 x float> %x) {
+; CHECK-LABEL: @unary_neg_constant_vec(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul ninf <2 x float> [[X:%.*]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub = fneg <2 x float> %x
+  %mul = fmul ninf <2 x float> %sub, <float 2.0, float 3.0>
+  ret <2 x float> %mul
+}
+
+define <2 x float> @neg_constant_vec_undef(<2 x float> %x) {
+; CHECK-LABEL: @neg_constant_vec_undef(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul ninf <2 x float> [[X:%.*]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub = fsub <2 x float> <float undef, float -0.0>, %x
+  %mul = fmul ninf <2 x float> %sub, <float 2.0, float 3.0>
+  ret <2 x float> %mul
+}
+
+; (0.0 - X) * C => X * -C
+define float @neg_nsz_constant(float %x) {
+; CHECK-LABEL: @neg_nsz_constant(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul nnan float [[X:%.*]], -2.000000e+01
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub = fsub nsz float 0.0, %x
+  %mul = fmul nnan float %sub, 2.0e+1
+  ret float %mul
+}
+
+define float @unary_neg_nsz_constant(float %x) {
+; CHECK-LABEL: @unary_neg_nsz_constant(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul nnan float [[X:%.*]], -2.000000e+01
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub = fneg nsz float %x
+  %mul = fmul nnan float %sub, 2.0e+1
+  ret float %mul
+}
+
+; (-0.0 - X) * (-0.0 - Y) => X * Y
+define float @neg_neg(float %x, float %y) {
+; CHECK-LABEL: @neg_neg(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub1 = fsub float -0.0, %x
+  %sub2 = fsub float -0.0, %y
+  %mul = fmul arcp float %sub1, %sub2
+  ret float %mul
+}
+
+define float @unary_neg_unary_neg(float %x, float %y) {
+; CHECK-LABEL: @unary_neg_unary_neg(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub1 = fneg float %x
+  %sub2 = fneg float %y
+  %mul = fmul arcp float %sub1, %sub2
+  ret float %mul
+}
+
+define float @unary_neg_neg(float %x, float %y) {
+; CHECK-LABEL: @unary_neg_neg(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub1 = fneg float %x
+  %sub2 = fsub float -0.0, %y
+  %mul = fmul arcp float %sub1, %sub2
+  ret float %mul
+}
+
+define float @neg_unary_neg(float %x, float %y) {
+; CHECK-LABEL: @neg_unary_neg(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub1 = fsub float -0.0, %x
+  %sub2 = fneg float %y
+  %mul = fmul arcp float %sub1, %sub2
+  ret float %mul
+}
+
+define <2 x float> @neg_neg_vec(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @neg_neg_vec(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub1 = fsub <2 x float> <float -0.0, float -0.0>, %x
+  %sub2 = fsub <2 x float> <float -0.0, float -0.0>, %y
+  %mul = fmul arcp <2 x float> %sub1, %sub2
+  ret <2 x float> %mul
+}
+
+define <2 x float> @unary_neg_unary_neg_vec(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @unary_neg_unary_neg_vec(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub1 = fneg <2 x float> %x
+  %sub2 = fneg <2 x float> %y
+  %mul = fmul arcp <2 x float> %sub1, %sub2
+  ret <2 x float> %mul
+}
+
+define <2 x float> @unary_neg_neg_vec(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @unary_neg_neg_vec(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub1 = fneg <2 x float> %x
+  %sub2 = fsub <2 x float> <float -0.0, float -0.0>, %y
+  %mul = fmul arcp <2 x float> %sub1, %sub2
+  ret <2 x float> %mul
+}
+
+define <2 x float> @neg_unary_neg_vec(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @neg_unary_neg_vec(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub1 = fsub <2 x float> <float -0.0, float -0.0>, %x
+  %sub2 = fneg <2 x float> %y
+  %mul = fmul arcp <2 x float> %sub1, %sub2
+  ret <2 x float> %mul
+}
+
+define <2 x float> @neg_neg_vec_undef(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @neg_neg_vec_undef(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub1 = fsub <2 x float> <float -0.0, float undef>, %x
+  %sub2 = fsub <2 x float> <float undef, float -0.0>, %y
+  %mul = fmul arcp <2 x float> %sub1, %sub2
+  ret <2 x float> %mul
+}
+
+define <2 x float> @unary_neg_neg_vec_undef(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @unary_neg_neg_vec_undef(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %neg = fneg <2 x float> %x
+  %sub = fsub <2 x float> <float undef, float -0.0>, %y
+  %mul = fmul arcp <2 x float> %neg, %sub
+  ret <2 x float> %mul
+}
+
+define <2 x float> @neg_unary_neg_vec_undef(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @neg_unary_neg_vec_undef(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub = fsub <2 x float> <float -0.0, float undef>, %x
+  %neg = fneg <2 x float> %y
+  %mul = fmul arcp <2 x float> %sub, %neg
+  ret <2 x float> %mul
+}
+
+; (0.0 - X) * (0.0 - Y) => X * Y
+define float @neg_neg_nsz(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_nsz(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul afn float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub1 = fsub nsz float 0.0, %x
+  %sub2 = fsub nsz float 0.0, %y
+  %mul = fmul afn float %sub1, %sub2
+  ret float %mul
+}
+
+declare void @use_f32(float)
+
+define float @neg_neg_multi_use(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_multi_use(
+; CHECK-NEXT:    [[NX:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[NY:%.*]] = fneg float [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul afn float [[X]], [[Y]]
+; CHECK-NEXT:    call void @use_f32(float [[NX]])
+; CHECK-NEXT:    call void @use_f32(float [[NY]])
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %nx = fsub float -0.0, %x
+  %ny = fsub float -0.0, %y
+  %mul = fmul afn float %nx, %ny
+  call void @use_f32(float %nx)
+  call void @use_f32(float %ny)
+  ret float %mul
+}
+
+define float @unary_neg_unary_neg_multi_use(float %x, float %y) {
+; CHECK-LABEL: @unary_neg_unary_neg_multi_use(
+; CHECK-NEXT:    [[NX:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[NY:%.*]] = fneg float [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul afn float [[X]], [[Y]]
+; CHECK-NEXT:    call void @use_f32(float [[NX]])
+; CHECK-NEXT:    call void @use_f32(float [[NY]])
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %nx = fneg float %x
+  %ny = fneg float %y
+  %mul = fmul afn float %nx, %ny
+  call void @use_f32(float %nx)
+  call void @use_f32(float %ny)
+  ret float %mul
+}
+
+define float @unary_neg_neg_multi_use(float %x, float %y) {
+; CHECK-LABEL: @unary_neg_neg_multi_use(
+; CHECK-NEXT:    [[NX:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[NY:%.*]] = fneg float [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul afn float [[X]], [[Y]]
+; CHECK-NEXT:    call void @use_f32(float [[NX]])
+; CHECK-NEXT:    call void @use_f32(float [[NY]])
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %nx = fneg float %x
+  %ny = fsub float -0.0, %y
+  %mul = fmul afn float %nx, %ny
+  call void @use_f32(float %nx)
+  call void @use_f32(float %ny)
+  ret float %mul
+}
+
+define float @neg_unary_neg_multi_use(float %x, float %y) {
+; CHECK-LABEL: @neg_unary_neg_multi_use(
+; CHECK-NEXT:    [[NX:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[NY:%.*]] = fneg float [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul afn float [[X]], [[Y]]
+; CHECK-NEXT:    call void @use_f32(float [[NX]])
+; CHECK-NEXT:    call void @use_f32(float [[NY]])
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %nx = fsub float -0.0, %x
+  %ny = fneg float %y
+  %mul = fmul afn float %nx, %ny
+  call void @use_f32(float %nx)
+  call void @use_f32(float %ny)
+  ret float %mul
+}
+
+; (-0.0 - X) * Y
+define float @neg_mul(float %x, float %y) {
+; CHECK-LABEL: @neg_mul(
+; CHECK-NEXT:    [[SUB:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[SUB]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub = fsub float -0.0, %x
+  %mul = fmul float %sub, %y
+  ret float %mul
+}
+
+define float @unary_neg_mul(float %x, float %y) {
+; CHECK-LABEL: @unary_neg_mul(
+; CHECK-NEXT:    [[NEG:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[NEG]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %neg = fneg float %x
+  %mul = fmul float %neg, %y
+  ret float %mul
+}
+
+define <2 x float> @neg_mul_vec(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @neg_mul_vec(
+; CHECK-NEXT:    [[SUB:%.*]] = fneg <2 x float> [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[SUB]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub = fsub <2 x float> <float -0.0, float -0.0>, %x
+  %mul = fmul <2 x float> %sub, %y
+  ret <2 x float> %mul
+}
+
+define <2 x float> @unary_neg_mul_vec(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @unary_neg_mul_vec(
+; CHECK-NEXT:    [[SUB:%.*]] = fneg <2 x float> [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[SUB]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub = fneg <2 x float> %x
+  %mul = fmul <2 x float> %sub, %y
+  ret <2 x float> %mul
+}
+
+define <2 x float> @neg_mul_vec_undef(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @neg_mul_vec_undef(
+; CHECK-NEXT:    [[SUB:%.*]] = fneg <2 x float> [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[SUB]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sub = fsub <2 x float> <float undef, float -0.0>, %x
+  %mul = fmul <2 x float> %sub, %y
+  ret <2 x float> %mul
+}
+
+; (0.0 - X) * Y
+define float @neg_sink_nsz(float %x, float %y) {
+; CHECK-LABEL: @neg_sink_nsz(
+; CHECK-NEXT:    [[SUB1:%.*]] = fneg nsz float [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[SUB1]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sub1 = fsub nsz float 0.0, %x
+  %mul = fmul float %sub1, %y
+  ret float %mul
+}
+
+define float @neg_sink_multi_use(float %x, float %y) {
+; CHECK-LABEL: @neg_sink_multi_use(
+; CHECK-NEXT:    [[SUB1:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[SUB1]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul float [[MUL]], [[SUB1]]
+; CHECK-NEXT:    ret float [[MUL2]]
+;
+  %sub1 = fsub float -0.0, %x
+  %mul = fmul float %sub1, %y
+  %mul2 = fmul float %mul, %sub1
+  ret float %mul2
+}
+
+define float @unary_neg_mul_multi_use(float %x, float %y) {
+; CHECK-LABEL: @unary_neg_mul_multi_use(
+; CHECK-NEXT:    [[SUB1:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[SUB1]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul float [[MUL]], [[SUB1]]
+; CHECK-NEXT:    ret float [[MUL2]]
+;
+  %sub1 = fneg float %x
+  %mul = fmul float %sub1, %y
+  %mul2 = fmul float %mul, %sub1
+  ret float %mul2
+}
+
+; Don't crash when attempting to cast a constant FMul to an instruction.
+define void @test8(i32* %inout) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[LOCAL_VAR_7_0:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP0]] = insertelement <4 x float> [[LOCAL_VAR_7_0]], float 0.000000e+00, i32 2
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i32, i32* %inout, align 4
+  %conv = uitofp i32 %0 to float
+  %vecinit = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, float %conv, i32 3
+  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vecinit
+  %1 = shufflevector <4 x float> %sub, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %mul = fmul <4 x float> zeroinitializer, %1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %local_var_7.0 = phi <4 x float> [ %mul, %entry ], [ %2, %for.body ]
+  br i1 undef, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = insertelement <4 x float> %local_var_7.0, float 0.000000e+00, i32 2
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; X * -1.0 => -0.0 - X
+define float @test9(float %x) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[MUL:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %mul = fmul float %x, -1.0
+  ret float %mul
+}
+
+; PR18532
+define <4 x float> @test10(<4 x float> %x) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[MUL:%.*]] = fneg arcp afn <4 x float> [[X:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[MUL]]
+;
+  %mul = fmul arcp afn <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+  ret <4 x float> %mul
+}
+
+define float @test11(float %x, float %y) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[B:%.*]] = fadd fast float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = fadd fast float [[B]], 3.000000e+00
+; CHECK-NEXT:    ret float [[C]]
+;
+  %a = fadd fast float %x, 1.0
+  %b = fadd fast float %y, 2.0
+  %c = fadd fast float %a, %b
+  ret float %c
+}
+
+declare double @llvm.sqrt.f64(double)
+
+; With unsafe/fast math, sqrt(X) * sqrt(X) is just X,
+; but make sure another use of the sqrt is intact.
+; Note that the remaining fmul is altered but is not 'fast'
+; itself because it was not marked 'fast' originally.
+; Thus, we have an overall fast result, but no more indication of
+; 'fast'ness in the code.
+define double @sqrt_squared2(double %f) {
+; CHECK-LABEL: @sqrt_squared2(
+; CHECK-NEXT:    [[SQRT:%.*]] = call double @llvm.sqrt.f64(double [[F:%.*]])
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul double [[SQRT]], [[F]]
+; CHECK-NEXT:    ret double [[MUL2]]
+;
+  %sqrt = call double @llvm.sqrt.f64(double %f)
+  %mul1 = fmul fast double %sqrt, %sqrt
+  %mul2 = fmul double %mul1, %sqrt
+  ret double %mul2
+}
+
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+define float @fabs_squared(float %x) {
+; CHECK-LABEL: @fabs_squared(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[X:%.*]], [[X]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  %mul = fmul float %x.fabs, %x.fabs
+  ret float %mul
+}
+
+define float @fabs_squared_fast(float %x) {
+; CHECK-LABEL: @fabs_squared_fast(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[X:%.*]], [[X]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  %mul = fmul fast float %x.fabs, %x.fabs
+  ret float %mul
+}
+
+define float @fabs_fabs(float %x, float %y) {
+; CHECK-LABEL: @fabs_fabs(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  %y.fabs = call float @llvm.fabs.f32(float %y)
+  %mul = fmul float %x.fabs, %y.fabs
+  ret float %mul
+}
+
+define float @fabs_fabs_extra_use1(float %x, float %y) {
+; CHECK-LABEL: @fabs_fabs_extra_use1(
+; CHECK-NEXT:    [[X_FABS:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
+; CHECK-NEXT:    call void @use_f32(float [[X_FABS]])
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul ninf float [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = call ninf float @llvm.fabs.f32(float [[TMP1]])
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  call void @use_f32(float %x.fabs)
+  %y.fabs = call float @llvm.fabs.f32(float %y)
+  %mul = fmul ninf float %x.fabs, %y.fabs
+  ret float %mul
+}
+
+define float @fabs_fabs_extra_use2(float %x, float %y) {
+; CHECK-LABEL: @fabs_fabs_extra_use2(
+; CHECK-NEXT:    [[Y_FABS:%.*]] = call fast float @llvm.fabs.f32(float [[Y:%.*]])
+; CHECK-NEXT:    call void @use_f32(float [[Y_FABS]])
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc ninf float [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[MUL:%.*]] = call reassoc ninf float @llvm.fabs.f32(float [[TMP1]])
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %x.fabs = call fast float @llvm.fabs.f32(float %x)
+  %y.fabs = call fast float @llvm.fabs.f32(float %y)
+  call void @use_f32(float %y.fabs)
+  %mul = fmul reassoc ninf float %x.fabs, %y.fabs
+  ret float %mul
+}
+
+; negative test - don't create an extra instruction
+
+define float @fabs_fabs_extra_use3(float %x, float %y) {
+; CHECK-LABEL: @fabs_fabs_extra_use3(
+; CHECK-NEXT:    [[X_FABS:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
+; CHECK-NEXT:    call void @use_f32(float [[X_FABS]])
+; CHECK-NEXT:    [[Y_FABS:%.*]] = call float @llvm.fabs.f32(float [[Y:%.*]])
+; CHECK-NEXT:    call void @use_f32(float [[Y_FABS]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[X_FABS]], [[Y_FABS]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  call void @use_f32(float %x.fabs)
+  %y.fabs = call float @llvm.fabs.f32(float %y)
+  call void @use_f32(float %y.fabs)
+  %mul = fmul float %x.fabs, %y.fabs
+  ret float %mul
+}
+
+; (X*Y) * X => (X*X) * Y
+; The transform only requires 'reassoc', but test other FMF in
+; the commuted variants to make sure FMF propagates as expected.
+
+define float @reassoc_common_operand1(float %x, float %y) {
+; CHECK-LABEL: @reassoc_common_operand1(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul reassoc float [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL2]]
+;
+  %mul1 = fmul float %x, %y
+  %mul2 = fmul reassoc float %mul1, %x
+  ret float %mul2
+}
+
+; (Y*X) * X => (X*X) * Y
+
+define float @reassoc_common_operand2(float %x, float %y) {
+; CHECK-LABEL: @reassoc_common_operand2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast float [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast float [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL2]]
+;
+  %mul1 = fmul float %y, %x
+  %mul2 = fmul fast float %mul1, %x
+  ret float %mul2
+}
+
+; X * (X*Y) => (X*X) * Y
+
+define float @reassoc_common_operand3(float %x1, float %y) {
+; CHECK-LABEL: @reassoc_common_operand3(
+; CHECK-NEXT:    [[X:%.*]] = fdiv float [[X1:%.*]], 3.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc nnan float [[X]], [[X]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul reassoc nnan float [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL2]]
+;
+  %x = fdiv float %x1, 3.0 ; thwart complexity-based canonicalization
+  %mul1 = fmul float %x, %y
+  %mul2 = fmul reassoc nnan float %x, %mul1
+  ret float %mul2
+}
+
+; X * (Y*X) => (X*X) * Y
+
+define float @reassoc_common_operand4(float %x1, float %y) {
+; CHECK-LABEL: @reassoc_common_operand4(
+; CHECK-NEXT:    [[X:%.*]] = fdiv float [[X1:%.*]], 3.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc ninf float [[X]], [[X]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul reassoc ninf float [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL2]]
+;
+  %x = fdiv float %x1, 3.0 ; thwart complexity-based canonicalization
+  %mul1 = fmul float %y, %x
+  %mul2 = fmul reassoc ninf float %x, %mul1
+  ret float %mul2
+}
+
+; No change if the first fmul has another use.
+
+define float @reassoc_common_operand_multi_use(float %x, float %y) {
+; CHECK-LABEL: @reassoc_common_operand_multi_use(
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast float [[MUL1]], [[X]]
+; CHECK-NEXT:    call void @use_f32(float [[MUL1]])
+; CHECK-NEXT:    ret float [[MUL2]]
+;
+  %mul1 = fmul float %x, %y
+  %mul2 = fmul fast float %mul1, %x
+  call void @use_f32(float %mul1)
+  ret float %mul2
+}
+
+declare float @llvm.log2.f32(float)
+
+; log2(Y * 0.5) * X = log2(Y) * X - X
+
+define float @log2half(float %x, float %y) {
+; CHECK-LABEL: @log2half(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.log2.f32(float [[Y:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fsub fast float [[TMP2]], [[X]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %halfy = fmul float %y, 0.5
+  %log2 = call float @llvm.log2.f32(float %halfy)
+  %mul = fmul fast float %log2, %x
+  ret float %mul
+}
+
+define float @log2half_commute(float %x1, float %y) {
+; CHECK-LABEL: @log2half_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.log2.f32(float [[Y:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[TMP1]], [[X1:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast float [[TMP2]], [[X1]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[TMP3]], 0x3FC24924A0000000
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %x = fdiv float %x1, 7.0 ; thwart complexity-based canonicalization
+  %halfy = fmul float %y, 0.5
+  %log2 = call float @llvm.log2.f32(float %halfy)
+  %mul = fmul fast float %x, %log2
+  ret float %mul
+}
+
+; C1/X * C2 => (C1*C2) / X
+
+define float @fdiv_constant_numerator_fmul(float %x) {
+; CHECK-LABEL: @fdiv_constant_numerator_fmul(
+; CHECK-NEXT:    [[T3:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]]
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fdiv float 2.0e+3, %x
+  %t3 = fmul reassoc float %t1, 6.0e+3
+  ret float %t3
+}
+
+; C1/X * C2 => (C1*C2) / X is disabled if C1/X has multiple uses
+
+@fmul2_external = external global float
+
+define float @fdiv_constant_numerator_fmul_extra_use(float %x) {
+; CHECK-LABEL: @fdiv_constant_numerator_fmul_extra_use(
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv fast float 1.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    store float [[DIV]], float* @fmul2_external, align 4
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[DIV]], 2.000000e+00
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %div = fdiv fast float 1.0, %x
+  store float %div, float* @fmul2_external
+  %mul = fmul fast float %div, 2.0
+  ret float %mul
+}
+
+; X/C1 * C2 => X * (C2/C1) (if C2/C1 is normal FP)
+
+define float @fdiv_constant_denominator_fmul(float %x) {
+; CHECK-LABEL: @fdiv_constant_denominator_fmul(
+; CHECK-NEXT:    [[T3:%.*]] = fmul reassoc float [[X:%.*]], 3.000000e+00
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fdiv float %x, 2.0e+3
+  %t3 = fmul reassoc float %t1, 6.0e+3
+  ret float %t3
+}
+
+define <4 x float> @fdiv_constant_denominator_fmul_vec(<4 x float> %x) {
+; CHECK-LABEL: @fdiv_constant_denominator_fmul_vec(
+; CHECK-NEXT:    [[T3:%.*]] = fmul reassoc <4 x float> [[X:%.*]], <float 3.000000e+00, float 2.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    ret <4 x float> [[T3]]
+;
+  %t1 = fdiv <4 x float> %x, <float 2.0e+3, float 3.0e+3, float 2.0e+3, float 1.0e+3>
+  %t3 = fmul reassoc <4 x float> %t1, <float 6.0e+3, float 6.0e+3, float 2.0e+3, float 1.0e+3>
+  ret <4 x float> %t3
+}
+
+; Make sure fmul with constant expression doesn't assert.
+
+define <4 x float> @fdiv_constant_denominator_fmul_vec_constexpr(<4 x float> %x) {
+; CHECK-LABEL: @fdiv_constant_denominator_fmul_vec_constexpr(
+; CHECK-NEXT:    [[T3:%.*]] = fmul reassoc <4 x float> [[X:%.*]], <float 3.000000e+00, float 2.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    ret <4 x float> [[T3]]
+;
+  %constExprMul = bitcast i128 trunc (i160 bitcast (<5 x float> <float 6.0e+3, float 6.0e+3, float 2.0e+3, float 1.0e+3, float undef> to i160) to i128) to <4 x float>
+  %t1 = fdiv <4 x float> %x, <float 2.0e+3, float 3.0e+3, float 2.0e+3, float 1.0e+3>
+  %t3 = fmul reassoc <4 x float> %t1, %constExprMul
+  ret <4 x float> %t3
+}
+
+; This shows that at least part of instcombine does not check constant
+; values to see if it is creating denorms (0x3800000000000000 is a denorm
+; for 32-bit float), so protecting against denorms in other parts is
+; probably not doing the intended job.
+
+define float @fmul_constant_reassociation(float %x) {
+; CHECK-LABEL: @fmul_constant_reassociation(
+; CHECK-NEXT:    [[R:%.*]] = fmul reassoc nsz float [[X:%.*]], 0x3800000000000000
+; CHECK-NEXT:    ret float [[R]]
+;
+  %mul_flt_min = fmul reassoc nsz float %x, 0x3810000000000000
+  %r = fmul reassoc nsz float  %mul_flt_min, 0.5
+  ret float %r
+}
+
+; Canonicalization "X/C1 * C2 => X * (C2/C1)" still applies if C2/C1 is denormal
+; (otherwise, we should not have allowed the reassociation in the previous test).
+; 0x3810000000000000 == FLT_MIN
+
+define float @fdiv_constant_denominator_fmul_denorm(float %x) {
+; CHECK-LABEL: @fdiv_constant_denominator_fmul_denorm(
+; CHECK-NEXT:    [[T3:%.*]] = fmul fast float [[X:%.*]], 0x3760620000000000
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fdiv float %x, 2.0e+3
+  %t3 = fmul fast float %t1, 0x3810000000000000
+  ret float %t3
+}
+
+; X / C1 * C2 => X / (C2/C1) if C1/C2 is abnormal, but C2/C1 is a normal value.
+; TODO: We don't convert the fast fdiv to fmul because that would be multiplication
+; by a denormal, but we could do better when we know that denormals are not a problem.
+
+define float @fdiv_constant_denominator_fmul_denorm_try_harder(float %x) {
+; CHECK-LABEL: @fdiv_constant_denominator_fmul_denorm_try_harder(
+; CHECK-NEXT:    [[T3:%.*]] = fdiv reassoc float [[X:%.*]], 0x47E8000000000000
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fdiv float %x, 3.0
+  %t3 = fmul reassoc float %t1, 0x3810000000000000
+  ret float %t3
+}
+
+; Negative test: we should not have 2 divisions instead of the 1 we started with.
+
+define float @fdiv_constant_denominator_fmul_denorm_try_harder_extra_use(float %x) {
+; CHECK-LABEL: @fdiv_constant_denominator_fmul_denorm_try_harder_extra_use(
+; CHECK-NEXT:    [[T1:%.*]] = fdiv float [[X:%.*]], 3.000000e+00
+; CHECK-NEXT:    [[T3:%.*]] = fmul fast float [[T1]], 0x3810000000000000
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[T1]], [[T3]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %t1 = fdiv float %x, 3.0e+0
+  %t3 = fmul fast float %t1, 0x3810000000000000
+  %r = fadd float %t1, %t3
+  ret float %r
+}
+
+; (X + C1) * C2 --> (X * C2) + C1*C2
+
+define float @fmul_fadd_distribute(float %x) {
+; CHECK-LABEL: @fmul_fadd_distribute(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[X:%.*]], 3.000000e+00
+; CHECK-NEXT:    [[T3:%.*]] = fadd reassoc float [[TMP1]], 6.000000e+00
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t2 = fadd float %x, 2.0
+  %t3 = fmul reassoc float %t2, 3.0
+  ret float %t3
+}
+
+; (X - C1) * C2 --> (X * C2) - C1*C2
+
+define float @fmul_fsub_distribute1(float %x) {
+; CHECK-LABEL: @fmul_fsub_distribute1(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[X:%.*]], 3.000000e+00
+; CHECK-NEXT:    [[T3:%.*]] = fadd reassoc float [[TMP1]], -6.000000e+00
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t2 = fsub float %x, 2.0
+  %t3 = fmul reassoc float %t2, 3.0
+  ret float %t3
+}
+
+; (C1 - X) * C2 --> C1*C2 - (X * C2)
+
+define float @fmul_fsub_distribute2(float %x) {
+; CHECK-LABEL: @fmul_fsub_distribute2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[X:%.*]], 3.000000e+00
+; CHECK-NEXT:    [[T3:%.*]] = fsub reassoc float 6.000000e+00, [[TMP1]]
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t2 = fsub float 2.0, %x
+  %t3 = fmul reassoc float %t2, 3.0
+  ret float %t3
+}
+
+; FIXME: This should only need 'reassoc'.
+; ((X*C1) + C2) * C3 => (X * (C1*C3)) + (C2*C3)
+
+define float @fmul_fadd_fmul_distribute(float %x) {
+; CHECK-LABEL: @fmul_fadd_fmul_distribute(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast float [[X:%.*]], 3.000000e+01
+; CHECK-NEXT:    [[T3:%.*]] = fadd fast float [[TMP1]], 1.000000e+01
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fmul float %x, 6.0
+  %t2 = fadd float %t1, 2.0
+  %t3 = fmul fast float %t2, 5.0
+  ret float %t3
+}
+
+define float @fmul_fadd_distribute_extra_use(float %x) {
+; CHECK-LABEL: @fmul_fadd_distribute_extra_use(
+; CHECK-NEXT:    [[T1:%.*]] = fmul float [[X:%.*]], 6.000000e+00
+; CHECK-NEXT:    [[T2:%.*]] = fadd float [[T1]], 2.000000e+00
+; CHECK-NEXT:    [[T3:%.*]] = fmul fast float [[T2]], 5.000000e+00
+; CHECK-NEXT:    call void @use_f32(float [[T2]])
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fmul float %x, 6.0
+  %t2 = fadd float %t1, 2.0
+  %t3 = fmul fast float %t2, 5.0
+  call void @use_f32(float %t2)
+  ret float %t3
+}
+
+; (X/C1 + C2) * C3 => X/(C1/C3) + C2*C3
+; 0x10000000000000 = DBL_MIN
+; TODO: We don't convert the fast fdiv to fmul because that would be multiplication
+; by a denormal, but we could do better when we know that denormals are not a problem.
+
+define double @fmul_fadd_fdiv_distribute2(double %x) {
+; CHECK-LABEL: @fmul_fadd_fdiv_distribute2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv reassoc double [[X:%.*]], 0x7FE8000000000000
+; CHECK-NEXT:    [[T3:%.*]] = fadd reassoc double [[TMP1]], 0x34000000000000
+; CHECK-NEXT:    ret double [[T3]]
+;
+  %t1 = fdiv double %x, 3.0
+  %t2 = fadd double %t1, 5.0
+  %t3 = fmul reassoc double %t2, 0x10000000000000
+  ret double %t3
+}
+
+; 5.0e-1 * DBL_MIN yields denormal, so "(f1*3.0 + 5.0e-1) * DBL_MIN" cannot
+; be simplified into f1 * (3.0*DBL_MIN) + (5.0e-1*DBL_MIN)
+
+define double @fmul_fadd_fdiv_distribute3(double %x) {
+; CHECK-LABEL: @fmul_fadd_fdiv_distribute3(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv reassoc double [[X:%.*]], 0x7FE8000000000000
+; CHECK-NEXT:    [[T3:%.*]] = fadd reassoc double [[TMP1]], 0x34000000000000
+; CHECK-NEXT:    ret double [[T3]]
+;
+  %t1 = fdiv double %x, 3.0
+  %t2 = fadd double %t1, 5.0
+  %t3 = fmul reassoc double %t2, 0x10000000000000
+  ret double %t3
+}
+
+; FIXME: This should only need 'reassoc'.
+; (C2 - (X*C1)) * C3 => (C2*C3) - (X * (C1*C3))
+
+define float @fmul_fsub_fmul_distribute(float %x) {
+; CHECK-LABEL: @fmul_fsub_fmul_distribute(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast float [[X:%.*]], 3.000000e+01
+; CHECK-NEXT:    [[T3:%.*]] = fsub fast float 1.000000e+01, [[TMP1]]
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fmul float %x, 6.0
+  %t2 = fsub float 2.0, %t1
+  %t3 = fmul fast float %t2, 5.0
+  ret float %t3
+}
+
+define float @fmul_fsub_fmul_distribute_extra_use(float %x) {
+; CHECK-LABEL: @fmul_fsub_fmul_distribute_extra_use(
+; CHECK-NEXT:    [[T1:%.*]] = fmul float [[X:%.*]], 6.000000e+00
+; CHECK-NEXT:    [[T2:%.*]] = fsub float 2.000000e+00, [[T1]]
+; CHECK-NEXT:    [[T3:%.*]] = fmul fast float [[T2]], 5.000000e+00
+; CHECK-NEXT:    call void @use_f32(float [[T2]])
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fmul float %x, 6.0
+  %t2 = fsub float 2.0, %t1
+  %t3 = fmul fast float %t2, 5.0
+  call void @use_f32(float %t2)
+  ret float %t3
+}
+
+; FIXME: This should only need 'reassoc'.
+; ((X*C1) - C2) * C3 => (X * (C1*C3)) - C2*C3
+
+define float @fmul_fsub_fmul_distribute2(float %x) {
+; CHECK-LABEL: @fmul_fsub_fmul_distribute2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast float [[X:%.*]], 3.000000e+01
+; CHECK-NEXT:    [[T3:%.*]] = fadd fast float [[TMP1]], -1.000000e+01
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fmul float %x, 6.0
+  %t2 = fsub float %t1, 2.0
+  %t3 = fmul fast float %t2, 5.0
+  ret float %t3
+}
+
+define float @fmul_fsub_fmul_distribute2_extra_use(float %x) {
+; CHECK-LABEL: @fmul_fsub_fmul_distribute2_extra_use(
+; CHECK-NEXT:    [[T1:%.*]] = fmul float [[X:%.*]], 6.000000e+00
+; CHECK-NEXT:    [[T2:%.*]] = fsub float 2.000000e+00, [[T1]]
+; CHECK-NEXT:    [[T3:%.*]] = fmul fast float [[T2]], 5.000000e+00
+; CHECK-NEXT:    call void @use_f32(float [[T2]])
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t1 = fmul float %x, 6.0
+  %t2 = fsub float 2.0, %t1
+  %t3 = fmul fast float %t2, 5.0
+  call void @use_f32(float %t2)
+  ret float %t3
+}
+
+; "(X*Y) * X => (X*X) * Y" is disabled if "X*Y" has multiple uses
+
+define float @common_factor(float %x, float %y) {
+; CHECK-LABEL: @common_factor(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul fast float [[MUL]], [[X]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[MUL1]], [[MUL]]
+; CHECK-NEXT:    ret float [[ADD]]
+;
+  %mul = fmul float %x, %y
+  %mul1 = fmul fast float %mul, %x
+  %add = fadd float %mul1, %mul
+  ret float %add
+}
+
+define double @fmul_fdiv_factor_squared(double %x, double %y) {
+; CHECK-LABEL: @fmul_fdiv_factor_squared(
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv fast double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SQUARED:%.*]] = fmul fast double [[DIV]], [[DIV]]
+; CHECK-NEXT:    ret double [[SQUARED]]
+;
+  %div = fdiv fast double %x, %y
+  %squared = fmul fast double %div, %div
+  ret double %squared
+}
+
+define double @fmul_fdivs_factor_common_denominator(double %x, double %y, double %z) {
+; CHECK-LABEL: @fmul_fdivs_factor_common_denominator(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast double [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast double [[Z:%.*]], [[Z]]
+; CHECK-NEXT:    [[MUL:%.*]] = fdiv fast double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %div1 = fdiv double %x, %z
+  %div2 = fdiv double %y, %z
+  %mul = fmul fast double %div1, %div2
+  ret double %mul
+}
+
+define double @fmul_fdivs_factor(double %x, double %y, double %z, double %w) {
+; CHECK-LABEL: @fmul_fdivs_factor(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc double [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv reassoc double [[TMP1]], [[W:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fdiv reassoc double [[TMP2]], [[Y:%.*]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %div1 = fdiv double %x, %y
+  %div2 = fdiv double %z, %w
+  %mul = fmul reassoc double %div1, %div2
+  ret double %mul
+}
+
+define double @fmul_fdiv_factor(double %x, double %y, double %z) {
+; CHECK-LABEL: @fmul_fdiv_factor(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc double [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fdiv reassoc double [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %div = fdiv double %x, %y
+  %mul = fmul reassoc double %div, %z
+  ret double %mul
+}
+
+define double @fmul_fdiv_factor_constant1(double %x, double %y) {
+; CHECK-LABEL: @fmul_fdiv_factor_constant1(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    [[MUL:%.*]] = fdiv reassoc double [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %div = fdiv double %x, %y
+  %mul = fmul reassoc double %div, 42.0
+  ret double %mul
+}
+
+define <2 x float> @fmul_fdiv_factor_constant2(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @fmul_fdiv_factor_constant2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = fdiv reassoc <2 x float> [[TMP1]], <float 4.200000e+01, float 1.200000e+01>
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %div = fdiv <2 x float> %x, <float 42.0, float 12.0>
+  %mul = fmul reassoc <2 x float> %div, %y
+  ret <2 x float> %mul
+}
+
+define float @fmul_fdiv_factor_extra_use(float %x, float %y) {
+; CHECK-LABEL: @fmul_fdiv_factor_extra_use(
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv float [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    call void @use_f32(float [[DIV]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc float [[DIV]], [[Y:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %div = fdiv float %x, 42.0
+  call void @use_f32(float %div)
+  %mul = fmul reassoc float %div, %y
+  ret float %mul
+}
+
+; Avoid infinite looping by moving negation out of a constant expression.
+
+@g = external global {[2 x i8*]}, align 1
+
+define double @fmul_negated_constant_expression(double %x) {
+; CHECK-LABEL: @fmul_negated_constant_expression(
+; CHECK-NEXT:    [[R:%.*]] = fmul double [[X:%.*]], fsub (double -0.000000e+00, double bitcast (i64 ptrtoint (i8** getelementptr inbounds ({ [2 x i8*] }, { [2 x i8*] }* @g, i64 0, inrange i32 0, i64 2) to i64) to double))
+; CHECK-NEXT:    ret double [[R]]
+;
+  %r = fmul double %x, fsub (double -0.000000e+00, double bitcast (i64 ptrtoint (i8** getelementptr inbounds ({ [2 x i8*] }, { [2 x i8*] }* @g, i64 0, inrange i32 0, i64 2) to i64) to double))
+  ret double %r
+}
+
+define float @negate_if_true(float %x, i1 %cond) {
+; CHECK-LABEL: @negate_if_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND:%.*]], float [[TMP1]], float [[X]]
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %sel = select i1 %cond, float -1.0, float 1.0
+  %r = fmul float %sel, %x
+  ret float %r
+}
+
+define float @negate_if_false(float %x, i1 %cond) {
+; CHECK-LABEL: @negate_if_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg arcp float [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select arcp i1 [[COND:%.*]], float [[X]], float [[TMP1]]
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %sel = select i1 %cond, float 1.0, float -1.0
+  %r = fmul arcp float %sel, %x
+  ret float %r
+}
+
+define <2 x double> @negate_if_true_commute(<2 x double> %px, i1 %cond) {
+; CHECK-LABEL: @negate_if_true_commute(
+; CHECK-NEXT:    [[X:%.*]] = fdiv <2 x double> <double 4.200000e+01, double 4.200000e+01>, [[PX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg ninf <2 x double> [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select ninf i1 [[COND:%.*]], <2 x double> [[TMP1]], <2 x double> [[X]]
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %x = fdiv <2 x double> <double 42.0, double 42.0>, %px  ; thwart complexity-based canonicalization
+  %sel = select i1 %cond, <2 x double> <double -1.0, double -1.0>, <2 x double> <double 1.0, double 1.0>
+  %r = fmul ninf <2 x double> %x, %sel
+  ret <2 x double> %r
+}
+
+define <2 x double> @negate_if_false_commute(<2 x double> %px, <2 x i1> %cond) {
+; CHECK-LABEL: @negate_if_false_commute(
+; CHECK-NEXT:    [[X:%.*]] = fdiv <2 x double> <double 4.200000e+01, double 5.100000e+00>, [[PX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x double> [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[COND:%.*]], <2 x double> [[X]], <2 x double> [[TMP1]]
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %x = fdiv <2 x double> <double 42.0, double 5.1>, %px  ; thwart complexity-based canonicalization
+  %sel = select <2 x i1> %cond, <2 x double> <double 1.0, double 1.0>, <2 x double> <double -1.0, double -1.0>
+  %r = fmul <2 x double> %x, %sel
+  ret <2 x double> %r
+}
+
+; Negative test
+
+define float @negate_if_true_extra_use(float %x, i1 %cond) {
+; CHECK-LABEL: @negate_if_true_extra_use(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], float -1.000000e+00, float 1.000000e+00
+; CHECK-NEXT:    call void @use_f32(float [[SEL]])
+; CHECK-NEXT:    [[R:%.*]] = fmul float [[SEL]], [[X:%.*]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %sel = select i1 %cond, float -1.0, float 1.0
+  call void @use_f32(float %sel)
+  %r = fmul float %sel, %x
+  ret float %r
+}
+
+; Negative test
+
+define <2 x double> @negate_if_true_wrong_constant(<2 x double> %px, i1 %cond) {
+; CHECK-LABEL: @negate_if_true_wrong_constant(
+; CHECK-NEXT:    [[X:%.*]] = fdiv <2 x double> <double 4.200000e+01, double 4.200000e+01>, [[PX:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], <2 x double> <double -1.000000e+00, double 0.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT:    [[R:%.*]] = fmul <2 x double> [[X]], [[SEL]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
+  %x = fdiv <2 x double> <double 42.0, double 42.0>, %px  ; thwart complexity-based canonicalization
+  %sel = select i1 %cond, <2 x double> <double -1.0, double 0.0>, <2 x double> <double 1.0, double 1.0>
+  %r = fmul <2 x double> %x, %sel
+  ret <2 x double> %r
+}
+
+; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0
+define float @fmul_select(float %x, i1 %c) {
+; CHECK-LABEL: @fmul_select(
+; CHECK-NEXT:    [[MUL:%.*]] = select fast i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sel = select i1 %c, float 1.0, float 0.0
+  %mul = fmul fast float %sel, %x
+  ret float %mul
+}
+
+; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0
+define <2 x float> @fmul_select_vec(<2 x float> %x, i1 %c) {
+; CHECK-LABEL: @fmul_select_vec(
+; CHECK-NEXT:    [[MUL:%.*]] = select fast i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer
+; CHECK-NEXT:    ret <2 x float> [[MUL]]
+;
+  %sel = select i1 %c, <2 x float> <float 1.0, float 1.0>, <2 x float> zeroinitializer
+  %mul = fmul fast <2 x float> %sel, %x
+  ret <2 x float> %mul
+}
+
+; Without fast math flags we can't optimize X * (C ? 1.0 : 0.0) -> C ? X : 0.0
+define float @fmul_select_strict(float %x, i1 %c) {
+; CHECK-LABEL: @fmul_select_strict(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], float 1.000000e+00, float 0.000000e+00
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[SEL]], [[X:%.*]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %sel = select i1 %c, float 1.0, float 0.0
+  %mul = fmul float %sel, %x
+  ret float %mul
+}
+
+; sqrt(X) *fast (C ? sqrt(X) : 1.0) -> C ? X : sqrt(X)
+define double @fmul_sqrt_select(double %x, i1 %c) {
+; CHECK-LABEL: @fmul_sqrt_select(
+; CHECK-NEXT:    [[SQR:%.*]] = call double @llvm.sqrt.f64(double [[X:%.*]])
+; CHECK-NEXT:    [[MUL:%.*]] = select fast i1 [[C:%.*]], double [[X]], double [[SQR]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %sqr = call double @llvm.sqrt.f64(double %x)
+  %sel = select i1 %c, double %sqr, double 1.0
+  %mul = fmul fast double %sqr, %sel
+  ret double %mul
+}
+
+; fastmath => z * splat(0) = splat(0), even for scalable vectors
+define <vscale x 2 x float> @mul_scalable_splat_zero(<vscale x 2 x float> %z) {
+; CHECK-LABEL: @mul_scalable_splat_zero(
+; CHECK-NEXT:    ret <vscale x 2 x float> zeroinitializer
+;
+  %shuf = shufflevector <vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 0.0, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
+  %t3 = fmul fast <vscale x 2 x float> %shuf, %z
+  ret <vscale x 2 x float> %t3
+}

diff --git a/llvm/test/Transforms/InstCombine/icmp-bc-vec-inseltpoison.ll b/llvm/test/Transforms/InstCombine/icmp-bc-vec-inseltpoison.ll
index 5df11093b147..b7e61eb0d34e 100644
--- a/llvm/test/Transforms/InstCombine/icmp-bc-vec-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-bc-vec-inseltpoison.ll
@@ -18,7 +18,7 @@ define i1 @test_i1_0(i1 %val) {
 ; CHECK-NEXT:    ret i1 [[COND]]
 ;
   %insvec = insertelement <4 x i1> poison, i1 %val, i32 0
-  %vec = shufflevector <4 x i1> %insvec, <4 x i1> undef, <4 x i32> zeroinitializer
+  %vec = shufflevector <4 x i1> %insvec, <4 x i1> poison, <4 x i32> zeroinitializer
   %cast = bitcast <4 x i1> %vec to i4
   %cond = icmp eq i4 %cast, 0
   ret i1 %cond
@@ -30,7 +30,7 @@ define i1 @test_i1_0_2(i1 %val) {
 ; CHECK-NEXT:    ret i1 [[COND]]
 ;
   %insvec = insertelement <4 x i1> poison, i1 %val, i32 2
-  %vec = shufflevector <4 x i1> %insvec, <4 x i1> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %vec = shufflevector <4 x i1> %insvec, <4 x i1> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   %cast = bitcast <4 x i1> %vec to i4
   %cond = icmp eq i4 %cast, 0
   ret i1 %cond
@@ -41,7 +41,7 @@ define i1 @test_i1_m1(i1 %val) {
 ; CHECK-NEXT:    ret i1 [[VAL:%.*]]
 ;
   %insvec = insertelement <4 x i1> poison, i1 %val, i32 0
-  %vec = shufflevector <4 x i1> %insvec, <4 x i1> undef, <4 x i32> zeroinitializer
+  %vec = shufflevector <4 x i1> %insvec, <4 x i1> poison, <4 x i32> zeroinitializer
   %cast = bitcast <4 x i1> %vec to i4
   %cond = icmp eq i4 %cast, -1
   ret i1 %cond
@@ -53,7 +53,7 @@ define i1 @test_i8_pattern(i8 %val) {
 ; CHECK-NEXT:    ret i1 [[COND]]
 ;
   %insvec = insertelement <4 x i8> poison, i8 %val, i32 0
-  %vec = shufflevector <4 x i8> %insvec, <4 x i8> undef, <4 x i32> zeroinitializer
+  %vec = shufflevector <4 x i8> %insvec, <4 x i8> poison, <4 x i32> zeroinitializer
   %cast = bitcast <4 x i8> %vec to i32
   %cond = icmp eq i32 %cast, 1212696648
   ret i1 %cond
@@ -65,7 +65,7 @@ define i1 @test_i8_pattern_2(i8 %val) {
 ; CHECK-NEXT:    ret i1 [[COND]]
 ;
   %insvec = insertelement <4 x i8> poison, i8 %val, i32 2
-  %vec = shufflevector <4 x i8> %insvec, <4 x i8> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %vec = shufflevector <4 x i8> %insvec, <4 x i8> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   %cast = bitcast <4 x i8> %vec to i32
   %cond = icmp eq i32 %cast, 1212696648
   ret i1 %cond
@@ -74,12 +74,12 @@ define i1 @test_i8_pattern_2(i8 %val) {
 ; Make sure we don't try to fold if the shufflemask has differing element values
 define i1 @test_i8_pattern_3(<4 x i8> %invec) {
 ; CHECK-LABEL: @test_i8_pattern_3(
-; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <4 x i8> [[INVEC:%.*]], <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <4 x i8> [[INVEC:%.*]], <4 x i8> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast <4 x i8> [[VEC]] to i32
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[CAST]], 1212696648
 ; CHECK-NEXT:    ret i1 [[COND]]
 ;
-  %vec = shufflevector <4 x i8> %invec, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %vec = shufflevector <4 x i8> %invec, <4 x i8> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   %cast = bitcast <4 x i8> %vec to i32
   %cond = icmp eq i32 %cast, 1212696648
   ret i1 %cond
@@ -89,13 +89,13 @@ define i1 @test_i8_pattern_3(<4 x i8> %invec) {
 define i1 @test_i8_nopattern(i8 %val) {
 ; CHECK-LABEL: @test_i8_nopattern(
 ; CHECK-NEXT:    [[INSVEC:%.*]] = insertelement <4 x i8> poison, i8 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <4 x i8> [[INSVEC]], <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <4 x i8> [[INSVEC]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast <4 x i8> [[VEC]] to i32
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[CAST]], 1212696647
 ; CHECK-NEXT:    ret i1 [[COND]]
 ;
   %insvec = insertelement <4 x i8> poison, i8 %val, i32 0
-  %vec = shufflevector <4 x i8> %insvec, <4 x i8> undef, <4 x i32> zeroinitializer
+  %vec = shufflevector <4 x i8> %insvec, <4 x i8> poison, <4 x i32> zeroinitializer
   %cast = bitcast <4 x i8> %vec to i32
   %cond = icmp eq i32 %cast, 1212696647
   ret i1 %cond
@@ -108,7 +108,7 @@ define i1 @test_i8_ult_pattern(i8 %val) {
 ; CHECK-NEXT:    ret i1 [[COND]]
 ;
   %insvec = insertelement <4 x i8> poison, i8 %val, i32 0
-  %vec = shufflevector <4 x i8> %insvec, <4 x i8> undef, <4 x i32> zeroinitializer
+  %vec = shufflevector <4 x i8> %insvec, <4 x i8> poison, <4 x i32> zeroinitializer
   %cast = bitcast <4 x i8> %vec to i32
   %cond = icmp ult i32 %cast, 1212696648
   ret i1 %cond
@@ -120,7 +120,7 @@ define i1 @extending_shuffle_with_weird_types(<2 x i9> %v) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i9 [[TMP1]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %splat = shufflevector <2 x i9> %v, <2 x i9> undef, <3 x i32> zeroinitializer
+  %splat = shufflevector <2 x i9> %v, <2 x i9> poison, <3 x i32> zeroinitializer
   %cast = bitcast <3 x i9> %splat to i27
   %cmp = icmp slt i27 %cast, 262657 ; 0x040201
   ret i1 %cmp

diff --git a/llvm/test/Transforms/InstCombine/icmp-vec-inseltpoison.ll b/llvm/test/Transforms/InstCombine/icmp-vec-inseltpoison.ll
new file mode 100644
index 000000000000..6fa9cfda4f45
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/icmp-vec-inseltpoison.ll
@@ -0,0 +1,375 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Canonicalize vector ge/le comparisons with constants to gt/lt.
+
+; Normal types are ConstantDataVectors. Test the constant values adjacent to the
+; min/max values that we're not allowed to transform.
+
+define <2 x i1> @sge(<2 x i8> %x) {
+; CHECK-LABEL: @sge(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <2 x i8> [[X:%.*]], <i8 -128, i8 126>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp sge <2 x i8> %x, <i8 -127, i8 -129>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @uge(<2 x i8> %x) {
+; CHECK-LABEL: @uge(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt <2 x i8> [[X:%.*]], <i8 -2, i8 0>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp uge <2 x i8> %x, <i8 -1, i8 1>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @sle(<2 x i8> %x) {
+; CHECK-LABEL: @sle(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i8> [[X:%.*]], <i8 127, i8 -127>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp sle <2 x i8> %x, <i8 126, i8 128>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @ule(<2 x i8> %x) {
+; CHECK-LABEL: @ule(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult <2 x i8> [[X:%.*]], <i8 -1, i8 1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp ule <2 x i8> %x, <i8 254, i8 0>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @ult_min_signed_value(<2 x i8> %x) {
+; CHECK-LABEL: @ult_min_signed_value(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <2 x i8> [[X:%.*]], <i8 -1, i8 -1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp ult <2 x i8> %x, <i8 128, i8 128>
+  ret <2 x i1> %cmp
+}
+
+; Zeros are special: they're ConstantAggregateZero.
+
+define <2 x i1> @sge_zero(<2 x i8> %x) {
+; CHECK-LABEL: @sge_zero(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <2 x i8> [[X:%.*]], <i8 -1, i8 -1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp sge <2 x i8> %x, <i8 0, i8 0>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @uge_zero(<2 x i8> %x) {
+; CHECK-LABEL: @uge_zero(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %cmp = icmp uge <2 x i8> %x, <i8 0, i8 0>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @sle_zero(<2 x i8> %x) {
+; CHECK-LABEL: @sle_zero(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i8> [[X:%.*]], <i8 1, i8 1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp sle <2 x i8> %x, <i8 0, i8 0>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @ule_zero(<2 x i8> %x) {
+; CHECK-LABEL: @ule_zero(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <2 x i8> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp ule <2 x i8> %x, <i8 0, i8 0>
+  ret <2 x i1> %cmp
+}
+
+; Weird types are ConstantVectors, not ConstantDataVectors. For an i3 type:
+; Signed min = -4
+; Unsigned min = 0
+; Signed max = 3
+; Unsigned max = 7
+
+define <3 x i1> @sge_weird(<3 x i3> %x) {
+; CHECK-LABEL: @sge_weird(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <3 x i3> [[X:%.*]], <i3 -4, i3 2, i3 -1>
+; CHECK-NEXT:    ret <3 x i1> [[CMP]]
+;
+  %cmp = icmp sge <3 x i3> %x, <i3 -3, i3 -5, i3 0>
+  ret <3 x i1> %cmp
+}
+
+define <3 x i1> @uge_weird(<3 x i3> %x) {
+; CHECK-LABEL: @uge_weird(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt <3 x i3> [[X:%.*]], <i3 -2, i3 0, i3 1>
+; CHECK-NEXT:    ret <3 x i1> [[CMP]]
+;
+  %cmp = icmp uge <3 x i3> %x, <i3 -1, i3 1, i3 2>
+  ret <3 x i1> %cmp
+}
+
+define <3 x i1> @sle_weird(<3 x i3> %x) {
+; CHECK-LABEL: @sle_weird(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <3 x i3> [[X:%.*]], <i3 3, i3 -3, i3 1>
+; CHECK-NEXT:    ret <3 x i1> [[CMP]]
+;
+  %cmp = icmp sle <3 x i3> %x, <i3 2, i3 4, i3 0>
+  ret <3 x i1> %cmp
+}
+
+define <3 x i1> @ule_weird(<3 x i3> %x) {
+; CHECK-LABEL: @ule_weird(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult <3 x i3> [[X:%.*]], <i3 -1, i3 1, i3 2>
+; CHECK-NEXT:    ret <3 x i1> [[CMP]]
+;
+  %cmp = icmp ule <3 x i3> %x, <i3 6, i3 0, i3 1>
+  ret <3 x i1> %cmp
+}
+
+; We can't do the transform if any constants are already at the limits.
+
+define <2 x i1> @sge_min(<2 x i3> %x) {
+; CHECK-LABEL: @sge_min(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge <2 x i3> [[X:%.*]], <i3 -4, i3 1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp sge <2 x i3> %x, <i3 -4, i3 1>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @uge_min(<2 x i3> %x) {
+; CHECK-LABEL: @uge_min(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp uge <2 x i3> [[X:%.*]], <i3 1, i3 0>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp uge <2 x i3> %x, <i3 1, i3 0>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @sle_max(<2 x i3> %x) {
+; CHECK-LABEL: @sle_max(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle <2 x i3> [[X:%.*]], <i3 1, i3 3>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp sle <2 x i3> %x, <i3 1, i3 3>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @ule_max(<2 x i3> %x) {
+; CHECK-LABEL: @ule_max(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ule <2 x i3> [[X:%.*]], <i3 -1, i3 1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp ule <2 x i3> %x, <i3 7, i3 1>
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @PR27756_1(<2 x i8> %a) {
+; CHECK-LABEL: @PR27756_1(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i8> [[A:%.*]], <i8 34, i8 1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp sle <2 x i8> %a, <i8 bitcast (<2 x i4> <i4 1, i4 2> to i8), i8 0>
+  ret <2 x i1> %cmp
+}
+
+; Undef elements don't prevent the transform of the comparison.
+
+define <3 x i1> @PR27756_2(<3 x i8> %a) {
+; CHECK-LABEL: @PR27756_2(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <3 x i8> [[A:%.*]], <i8 43, i8 43, i8 1>
+; CHECK-NEXT:    ret <3 x i1> [[CMP]]
+;
+  %cmp = icmp sle <3 x i8> %a, <i8 42, i8 undef, i8 0>
+  ret <3 x i1> %cmp
+}
+
+define <3 x i1> @PR27756_3(<3 x i8> %a) {
+; CHECK-LABEL: @PR27756_3(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <3 x i8> [[A:%.*]], <i8 0, i8 0, i8 41>
+; CHECK-NEXT:    ret <3 x i1> [[CMP]]
+;
+  %cmp = icmp sge <3 x i8> %a, <i8 undef, i8 1, i8 42>
+  ret <3 x i1> %cmp
+}
+
+@someglobal = global i32 0
+
+define <2 x i1> @PR27786(<2 x i8> %a) {
+; CHECK-LABEL: @PR27786(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle <2 x i8> [[A:%.*]], bitcast (i16 ptrtoint (i32* @someglobal to i16) to <2 x i8>)
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %cmp = icmp sle <2 x i8> %a, bitcast (i16 ptrtoint (i32* @someglobal to i16) to <2 x i8>)
+  ret <2 x i1> %cmp
+}
+
+; This is similar to a transform for shuffled binops: compare first, shuffle after.
+
+define <4 x i1> @same_shuffle_inputs_icmp(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @same_shuffle_inputs_icmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i8> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 2, i32 0>
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %shufx = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> < i32 3, i32 3, i32 2, i32 0 >
+  %shufy = shufflevector <4 x i8> %y, <4 x i8> poison, <4 x i32> < i32 3, i32 3, i32 2, i32 0 >
+  %cmp = icmp sgt <4 x i8> %shufx, %shufy
+  ret <4 x i1> %cmp
+}
+
+; fcmp and size-changing shuffles are ok too.
+
+define <5 x i1> @same_shuffle_inputs_fcmp(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @same_shuffle_inputs_fcmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp oeq <4 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> undef, <5 x i32> <i32 0, i32 1, i32 3, i32 2, i32 0>
+; CHECK-NEXT:    ret <5 x i1> [[CMP]]
+;
+  %shufx = shufflevector <4 x float> %x, <4 x float> poison, <5 x i32> < i32 0, i32 1, i32 3, i32 2, i32 0 >
+  %shufy = shufflevector <4 x float> %y, <4 x float> poison, <5 x i32> < i32 0, i32 1, i32 3, i32 2, i32 0 >
+  %cmp = fcmp oeq <5 x float> %shufx, %shufy
+  ret <5 x i1> %cmp
+}
+
+declare void @use_v4i8(<4 x i8>)
+
+define <4 x i1> @same_shuffle_inputs_icmp_extra_use1(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @same_shuffle_inputs_icmp_extra_use1(
+; CHECK-NEXT:    [[SHUFX:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i8> [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    call void @use_v4i8(<4 x i8> [[SHUFX]])
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %shufx = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> < i32 3, i32 3, i32 3, i32 3 >
+  %shufy = shufflevector <4 x i8> %y, <4 x i8> poison, <4 x i32> < i32 3, i32 3, i32 3, i32 3 >
+  %cmp = icmp ugt <4 x i8> %shufx, %shufy
+  call void @use_v4i8(<4 x i8> %shufx)
+  ret <4 x i1> %cmp
+}
+
+declare void @use_v2i8(<2 x i8>)
+
+define <2 x i1> @same_shuffle_inputs_icmp_extra_use2(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @same_shuffle_inputs_icmp_extra_use2(
+; CHECK-NEXT:    [[SHUFY:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> poison, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i8> [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[CMP:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> undef, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    call void @use_v2i8(<2 x i8> [[SHUFY]])
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %shufx = shufflevector <4 x i8> %x, <4 x i8> poison, <2 x i32> < i32 3, i32 2 >
+  %shufy = shufflevector <4 x i8> %y, <4 x i8> poison, <2 x i32> < i32 3, i32 2 >
+  %cmp = icmp eq <2 x i8> %shufx, %shufy
+  call void @use_v2i8(<2 x i8> %shufy)
+  ret <2 x i1> %cmp
+}
+
+; Negative test: if both shuffles have extra uses, don't transform because that would increase instruction count.
+
+define <2 x i1> @same_shuffle_inputs_icmp_extra_use3(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @same_shuffle_inputs_icmp_extra_use3(
+; CHECK-NEXT:    [[SHUFX:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUFY:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <2 x i8> [[SHUFX]], [[SHUFY]]
+; CHECK-NEXT:    call void @use_v2i8(<2 x i8> [[SHUFX]])
+; CHECK-NEXT:    call void @use_v2i8(<2 x i8> [[SHUFY]])
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %shufx = shufflevector <4 x i8> %x, <4 x i8> poison, <2 x i32> < i32 0, i32 0 >
+  %shufy = shufflevector <4 x i8> %y, <4 x i8> poison, <2 x i32> < i32 0, i32 0 >
+  %cmp = icmp eq <2 x i8> %shufx, %shufy
+  call void @use_v2i8(<2 x i8> %shufx)
+  call void @use_v2i8(<2 x i8> %shufy)
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @splat_icmp(<4 x i8> %x) {
+; CHECK-LABEL: @splat_icmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i8> [[X:%.*]], <i8 42, i8 42, i8 42, i8 42>
+; CHECK-NEXT:    [[CMP:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %splatx = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %cmp = icmp sgt <4 x i8> %splatx, <i8 42, i8 42, i8 42, i8 42>
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @splat_icmp_undef(<4 x i8> %x) {
+; CHECK-LABEL: @splat_icmp_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i8> [[X:%.*]], <i8 42, i8 42, i8 42, i8 42>
+; CHECK-NEXT:    [[CMP:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %splatx = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 2>
+  %cmp = icmp ult <4 x i8> %splatx, <i8 undef, i8 42, i8 undef, i8 42>
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @splat_icmp_larger_size(<2 x i8> %x) {
+; CHECK-LABEL: @splat_icmp_larger_size(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[X:%.*]], <i8 42, i8 42>
+; CHECK-NEXT:    [[CMP:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %splatx = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 1, i32 undef, i32 1, i32 undef>
+  %cmp = icmp eq <4 x i8> %splatx, <i8 42, i8 42, i8 undef, i8 42>
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @splat_fcmp_smaller_size(<5 x float> %x) {
+; CHECK-LABEL: @splat_fcmp_smaller_size(
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp oeq <5 x float> [[X:%.*]], <float 4.200000e+01, float 4.200000e+01, float 4.200000e+01, float 4.200000e+01, float 4.200000e+01>
+; CHECK-NEXT:    [[CMP:%.*]] = shufflevector <5 x i1> [[TMP1]], <5 x i1> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %splatx = shufflevector <5 x float> %x, <5 x float> poison, <4 x i32> <i32 1, i32 undef, i32 1, i32 undef>
+  %cmp = fcmp oeq <4 x float> %splatx, <float 42.0, float 42.0, float undef, float 42.0>
+  ret <4 x i1> %cmp
+}
+
+; Negative test
+
+define <4 x i1> @splat_icmp_extra_use(<4 x i8> %x) {
+; CHECK-LABEL: @splat_icmp_extra_use(
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    call void @use_v4i8(<4 x i8> [[SPLATX]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <4 x i8> [[SPLATX]], <i8 42, i8 42, i8 42, i8 42>
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %splatx = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  call void @use_v4i8(<4 x i8> %splatx)
+  %cmp = icmp sgt <4 x i8> %splatx, <i8 42, i8 42, i8 42, i8 42>
+  ret <4 x i1> %cmp
+}
+
+; Negative test
+
+define <4 x i1> @not_splat_icmp(<4 x i8> %x) {
+; CHECK-LABEL: @not_splat_icmp(
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <4 x i8> [[SPLATX]], <i8 42, i8 42, i8 42, i8 42>
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %splatx = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 3, i32 3>
+  %cmp = icmp sgt <4 x i8> %splatx, <i8 42, i8 42, i8 42, i8 42>
+  ret <4 x i1> %cmp
+}
+
+; Negative test
+
+define <4 x i1> @not_splat_icmp2(<4 x i8> %x) {
+; CHECK-LABEL: @not_splat_icmp2(
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <4 x i8> [[SPLATX]], <i8 43, i8 42, i8 42, i8 42>
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+  %splatx = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %cmp = icmp sgt <4 x i8> %splatx, <i8 43, i8 42, i8 42, i8 42>
+  ret <4 x i1> %cmp
+}

diff --git a/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll
index 64958b9ccd5a..5e102c48fbd6 100644
--- a/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll
@@ -254,7 +254,7 @@ bb1:
   br label %bb2
 
 bb2:
-  %widen = shufflevector <2 x float> %x, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widen = shufflevector <2 x float> %x, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %ext2 = extractelement <4 x float> %widen, i32 0
   %ins1 = insertelement <4 x float> <float 0.0, float 0.0, float undef, float undef>, float %ext2, i32 2
   %ins2 = insertelement <4 x float> %ins1, float %ext1, i32 3
@@ -432,7 +432,7 @@ define <4 x float> @insert_nonzero_index_splat(float %x) {
 ; CHECK-NEXT:    ret <4 x float> [[SPLAT]]
 ;
   %xv = insertelement <4 x float> poison, float %x, i32 2
-  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 2, i32 undef>
+  %splat = shufflevector <4 x float> %xv, <4 x float> poison, <4 x i32> <i32 undef, i32 2, i32 2, i32 undef>
   ret <4 x float> %splat
 }
 
@@ -443,7 +443,7 @@ define <3 x double> @insert_nonzero_index_splat_narrow(double %x) {
 ; CHECK-NEXT:    ret <3 x double> [[SPLAT]]
 ;
   %xv = insertelement <4 x double> poison, double %x, i32 3
-  %splat = shufflevector <4 x double> %xv, <4 x double> undef, <3 x i32> <i32 3, i32 undef, i32 3>
+  %splat = shufflevector <4 x double> %xv, <4 x double> poison, <3 x i32> <i32 3, i32 undef, i32 3>
   ret <3 x double> %splat
 }
 
@@ -454,7 +454,7 @@ define <5 x i7> @insert_nonzero_index_splat_widen(i7 %x) {
 ; CHECK-NEXT:    ret <5 x i7> [[SPLAT]]
 ;
   %xv = insertelement <4 x i7> poison, i7 %x, i32 1
-  %splat = shufflevector <4 x i7> %xv, <4 x i7> undef, <5 x i32> <i32 undef, i32 1, i32 1, i32 undef, i32 1>
+  %splat = shufflevector <4 x i7> %xv, <4 x i7> poison, <5 x i32> <i32 undef, i32 1, i32 1, i32 undef, i32 1>
   ret <5 x i7> %splat
 }
 
@@ -464,12 +464,12 @@ define <4 x float> @insert_nonzero_index_splat_extra_use(float %x) {
 ; CHECK-LABEL: @insert_nonzero_index_splat_extra_use(
 ; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 2
 ; CHECK-NEXT:    call void @use(<4 x float> [[XV]])
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 2, i32 undef>
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> poison, <4 x i32> <i32 undef, i32 2, i32 2, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[SPLAT]]
 ;
   %xv = insertelement <4 x float> poison, float %x, i32 2
   call void @use(<4 x float> %xv)
-  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 2, i32 undef>
+  %splat = shufflevector <4 x float> %xv, <4 x float> poison, <4 x i32> <i32 undef, i32 2, i32 2, i32 undef>
   ret <4 x float> %splat
 }
 
@@ -478,11 +478,11 @@ define <4 x float> @insert_nonzero_index_splat_extra_use(float %x) {
 define <4 x float> @insert_nonzero_index_splat_wrong_base(float %x, <4 x float> %y) {
 ; CHECK-LABEL: @insert_nonzero_index_splat_wrong_base(
 ; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 2
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> poison, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[SPLAT]]
 ;
   %xv = insertelement <4 x float> %y, float %x, i32 2
-  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
+  %splat = shufflevector <4 x float> %xv, <4 x float> poison, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
   ret <4 x float> %splat
 }
 
@@ -491,11 +491,11 @@ define <4 x float> @insert_nonzero_index_splat_wrong_base(float %x, <4 x float>
 define <4 x float> @insert_nonzero_index_splat_wrong_index(float %x, i32 %index) {
 ; CHECK-LABEL: @insert_nonzero_index_splat_wrong_index(
 ; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 [[INDEX:%.*]]
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> poison, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[SPLAT]]
 ;
   %xv = insertelement <4 x float> poison, float %x, i32 %index
-  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
+  %splat = shufflevector <4 x float> %xv, <4 x float> poison, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
   ret <4 x float> %splat
 }
 
@@ -506,7 +506,7 @@ define <4 x float> @insert_in_splat(float %x) {
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv = insertelement <4 x float> poison, float %x, i32 0
-  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+  %splat = shufflevector <4 x float> %xv, <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
   %r = insertelement <4 x float> %splat, float %x, i32 3
   ret <4 x float> %r
 }
@@ -515,14 +515,14 @@ define <4 x float> @insert_in_splat_extra_uses(float %x) {
 ; CHECK-LABEL: @insert_in_splat_extra_uses(
 ; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 0
 ; CHECK-NEXT:    call void @use(<4 x float> [[XV]])
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
 ; CHECK-NEXT:    call void @use(<4 x float> [[SPLAT]])
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv = insertelement <4 x float> poison, float %x, i32 0
   call void @use(<4 x float> %xv)
-  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+  %splat = shufflevector <4 x float> %xv, <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
   call void @use(<4 x float> %splat)
   %r = insertelement <4 x float> %splat, float %x, i32 3
   ret <4 x float> %r
@@ -533,12 +533,12 @@ define <4 x float> @insert_in_splat_extra_uses(float %x) {
 define <4 x float> @insert_in_splat_variable_index(float %x, i32 %y) {
 ; CHECK-LABEL: @insert_in_splat_variable_index(
 ; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 0
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[SPLAT]], float [[X]], i32 [[Y:%.*]]
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv = insertelement <4 x float> poison, float %x, i32 0
-  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+  %splat = shufflevector <4 x float> %xv, <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
   %r = insertelement <4 x float> %splat, float %x, i32 %y
   ret <4 x float> %r
 }
@@ -563,23 +563,23 @@ define <4 x float> @insert_in_nonsplat(float %x, <4 x float> %y) {
 define <4 x float> @insert_in_nonsplat2(float %x, <4 x float> %y) {
 ; CHECK-LABEL: @insert_in_nonsplat2(
 ; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 0
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[SPLAT]], float [[X]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %xv = insertelement <4 x float> %y, float %x, i32 0
-  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+  %splat = shufflevector <4 x float> %xv, <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
   %r = insertelement <4 x float> %splat, float %x, i32 3
   ret <4 x float> %r
 }
 
 define <4 x i8> @shuf_identity_padding(<2 x i8> %x, i8 %y) {
 ; CHECK-LABEL: @shuf_identity_padding(
-; CHECK-NEXT:    [[V1:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i8> [[V1]], i8 [[Y:%.*]], i32 2
 ; CHECK-NEXT:    ret <4 x i8> [[V2]]
 ;
-  %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %v0 = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %x1 = extractelement <2 x i8> %x, i32 1
   %v1 = insertelement <4 x i8> %v0, i8 %x1, i32 1
   %v2 = insertelement <4 x i8> %v1, i8 %y, i32 2
@@ -588,11 +588,11 @@ define <4 x i8> @shuf_identity_padding(<2 x i8> %x, i8 %y) {
 
 define <3 x i8> @shuf_identity_extract(<4 x i8> %x, i8 %y) {
 ; CHECK-LABEL: @shuf_identity_extract(
-; CHECK-NEXT:    [[V1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 undef>
 ; CHECK-NEXT:    [[V2:%.*]] = insertelement <3 x i8> [[V1]], i8 [[Y:%.*]], i32 2
 ; CHECK-NEXT:    ret <3 x i8> [[V2]]
 ;
-  %v0 = shufflevector <4 x i8> %x, <4 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 undef>
+  %v0 = shufflevector <4 x i8> %x, <4 x i8> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
   %x1 = extractelement <4 x i8> %x, i32 1
   %v1 = insertelement <3 x i8> %v0, i8 %x1, i32 1
   %v2 = insertelement <3 x i8> %v1, i8 %y, i32 2
@@ -601,13 +601,13 @@ define <3 x i8> @shuf_identity_extract(<4 x i8> %x, i8 %y) {
 
 define <4 x float> @shuf_identity_extract_extra_use(<6 x float> %x, float %y) {
 ; CHECK-LABEL: @shuf_identity_extract_extra_use(
-; CHECK-NEXT:    [[V0:%.*]] = shufflevector <6 x float> [[X:%.*]], <6 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 3>
+; CHECK-NEXT:    [[V0:%.*]] = shufflevector <6 x float> [[X:%.*]], <6 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 3>
 ; CHECK-NEXT:    call void @use(<4 x float> [[V0]])
-; CHECK-NEXT:    [[V1:%.*]] = shufflevector <6 x float> [[X]], <6 x float> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <6 x float> [[X]], <6 x float> poison, <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
 ; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[Y:%.*]], i32 1
 ; CHECK-NEXT:    ret <4 x float> [[V2]]
 ;
-  %v0 = shufflevector <6 x float> %x, <6 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 3>
+  %v0 = shufflevector <6 x float> %x, <6 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 3>
   call void @use(<4 x float> %v0)
   %x1 = extractelement <6 x float> %x, i32 2
   %v1 = insertelement <4 x float> %v0, float %x1, i32 2
@@ -619,13 +619,13 @@ define <4 x float> @shuf_identity_extract_extra_use(<6 x float> %x, float %y) {
 
 define <4 x i8> @shuf_identity_padding_variable_index(<2 x i8> %x, i8 %y, i32 %index) {
 ; CHECK-LABEL: @shuf_identity_padding_variable_index(
-; CHECK-NEXT:    [[V0:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[V0:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i8> [[X]], i32 [[INDEX:%.*]]
 ; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i8> [[V0]], i8 [[X1]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i8> [[V1]], i8 [[Y:%.*]], i32 2
 ; CHECK-NEXT:    ret <4 x i8> [[V2]]
 ;
-  %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %v0 = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %x1 = extractelement <2 x i8> %x, i32 %index
   %v1 = insertelement <4 x i8> %v0, i8 %x1, i32 %index
   %v2 = insertelement <4 x i8> %v1, i8 %y, i32 2
@@ -636,13 +636,13 @@ define <4 x i8> @shuf_identity_padding_variable_index(<2 x i8> %x, i8 %y, i32 %i
 
 define <4 x i8> @shuf_identity_padding_wrong_source_vec(<2 x i8> %x, i8 %y, <2 x i8> %other) {
 ; CHECK-LABEL: @shuf_identity_padding_wrong_source_vec(
-; CHECK-NEXT:    [[V0:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[V0:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i8> [[OTHER:%.*]], i32 1
 ; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i8> [[V0]], i8 [[X1]], i32 1
 ; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i8> [[V1]], i8 [[Y:%.*]], i32 2
 ; CHECK-NEXT:    ret <4 x i8> [[V2]]
 ;
-  %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %v0 = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %x1 = extractelement <2 x i8> %other, i32 1
   %v1 = insertelement <4 x i8> %v0, i8 %x1, i32 1
   %v2 = insertelement <4 x i8> %v1, i8 %y, i32 2
@@ -653,13 +653,13 @@ define <4 x i8> @shuf_identity_padding_wrong_source_vec(<2 x i8> %x, i8 %y, <2 x
 
 define <4 x i8> @shuf_identity_padding_wrong_index(<2 x i8> %x, i8 %y) {
 ; CHECK-LABEL: @shuf_identity_padding_wrong_index(
-; CHECK-NEXT:    [[V0:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[V0:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i8> [[X]], i32 1
 ; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i8> [[V0]], i8 [[X1]], i32 2
 ; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i8> [[V1]], i8 [[Y:%.*]], i32 3
 ; CHECK-NEXT:    ret <4 x i8> [[V2]]
 ;
-  %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %v0 = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %x1 = extractelement <2 x i8> %x, i32 1
   %v1 = insertelement <4 x i8> %v0, i8 %x1, i32 2
   %v2 = insertelement <4 x i8> %v1, i8 %y, i32 3
@@ -729,7 +729,7 @@ define <4 x float> @splat_constant(<4 x float> %x) {
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %ins3 = insertelement <4 x float> %x, float 3.0, i32 3
-  %splat3 = shufflevector <4 x float> %ins3, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %splat3 = shufflevector <4 x float> %ins3, <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %r = fadd <4 x float> %ins3, %splat3
   ret <4 x float> %r
 }

diff --git a/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll
new file mode 100644
index 000000000000..2f448ce1c740
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll
@@ -0,0 +1,637 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[E:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[E]], i32 [[C:%.*]], i32 [[D:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %e = icmp slt i32 %a, %b
+  %f = sext i1 %e to i32
+  %g = and i32 %c, %f
+  %h = xor i32 %f, -1
+  %i = and i32 %d, %h
+  %j = or i32 %g, %i
+  ret i32 %j
+}
+
+define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:    [[E_NOT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[E_NOT]], i32 [[C:%.*]], i32 [[D:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %e = icmp slt i32 %a, %b
+  %f = sext i1 %e to i32
+  %g = and i32 %c, %f
+  %h = xor i32 %f, -1
+  %i = and i32 %d, %h
+  %j = or i32 %i, %g
+  ret i32 %j
+}
+
+define i32 @goo(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @goo(
+; CHECK-NEXT:    [[T0:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[T0]], i32 [[C:%.*]], i32 [[D:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %t0 = icmp slt i32 %a, %b
+  %iftmp.0.0 = select i1 %t0, i32 -1, i32 0
+  %t1 = and i32 %iftmp.0.0, %c
+  %not = xor i32 %iftmp.0.0, -1
+  %t2 = and i32 %not, %d
+  %t3 = or i32 %t1, %t2
+  ret i32 %t3
+}
+
+define i32 @poo(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @poo(
+; CHECK-NEXT:    [[T0:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = select i1 [[T0]], i32 [[C:%.*]], i32 [[D:%.*]]
+; CHECK-NEXT:    ret i32 [[T3]]
+;
+  %t0 = icmp slt i32 %a, %b
+  %iftmp.0.0 = select i1 %t0, i32 -1, i32 0
+  %t1 = and i32 %iftmp.0.0, %c
+  %iftmp = select i1 %t0, i32 0, i32 -1
+  %t2 = and i32 %iftmp, %d
+  %t3 = or i32 %t1, %t2
+  ret i32 %t3
+}
+
+; PR32791 - https://bugs.llvm.org//show_bug.cgi?id=32791
+; The 2nd compare/select are canonicalized, so CSE and another round of instcombine or some other pass will fold this.
+
+define i32 @fold_inverted_icmp_preds(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @fold_inverted_icmp_preds(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[CMP1]], i32 [[C:%.*]], i32 0
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp slt i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[CMP2_NOT]], i32 0, i32 [[D:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %cmp1 = icmp slt i32 %a, %b
+  %sel1 = select i1 %cmp1, i32 %c, i32 0
+  %cmp2 = icmp sge i32 %a, %b
+  %sel2 = select i1 %cmp2, i32 %d, i32 0
+  %or = or i32 %sel1, %sel2
+  ret i32 %or
+}
+
+; The 2nd compare/select are canonicalized, so CSE and another round of instcombine or some other pass will fold this.
+
+define i32 @fold_inverted_icmp_preds_reverse(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @fold_inverted_icmp_preds_reverse(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[CMP1]], i32 0, i32 [[C:%.*]]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp slt i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[CMP2_NOT]], i32 [[D:%.*]], i32 0
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %cmp1 = icmp slt i32 %a, %b
+  %sel1 = select i1 %cmp1, i32 0, i32 %c
+  %cmp2 = icmp sge i32 %a, %b
+  %sel2 = select i1 %cmp2, i32 0, i32 %d
+  %or = or i32 %sel1, %sel2
+  ret i32 %or
+}
+
+; TODO: Should fcmp have the same sort of predicate canonicalization as icmp?
+
+define i32 @fold_inverted_fcmp_preds(float %a, float %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @fold_inverted_fcmp_preds(
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[CMP1]], i32 [[C:%.*]], i32 0
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp uge float [[A]], [[B]]
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[CMP2]], i32 [[D:%.*]], i32 0
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %cmp1 = fcmp olt float %a, %b
+  %sel1 = select i1 %cmp1, i32 %c, i32 0
+  %cmp2 = fcmp uge float %a, %b
+  %sel2 = select i1 %cmp2, i32 %d, i32 0
+  %or = or i32 %sel1, %sel2
+  ret i32 %or
+}
+
+; The 2nd compare/select are canonicalized, so CSE and another round of instcombine or some other pass will fold this.
+
+define <2 x i32> @fold_inverted_icmp_vector_preds(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
+; CHECK-LABEL: @fold_inverted_icmp_vector_preds(
+; CHECK-NEXT:    [[CMP1_NOT:%.*]] = icmp eq <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[SEL1:%.*]] = select <2 x i1> [[CMP1_NOT]], <2 x i32> zeroinitializer, <2 x i32> [[C:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq <2 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[SEL2:%.*]] = select <2 x i1> [[CMP2]], <2 x i32> [[D:%.*]], <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <2 x i32> [[OR]]
+;
+  %cmp1 = icmp ne <2 x i32> %a, %b
+  %sel1 = select <2 x i1> %cmp1, <2 x i32> %c, <2 x i32> <i32 0, i32 0>
+  %cmp2 = icmp eq <2 x i32> %a, %b
+  %sel2 = select <2 x i1> %cmp2, <2 x i32> %d, <2 x i32> <i32 0, i32 0>
+  %or = or <2 x i32> %sel1, %sel2
+  ret <2 x i32> %or
+}
+
+define i32 @par(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @par(
+; CHECK-NEXT:    [[T0:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[T0]], i32 [[C:%.*]], i32 [[D:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %t0 = icmp slt i32 %a, %b
+  %iftmp.1.0 = select i1 %t0, i32 -1, i32 0
+  %t1 = and i32 %iftmp.1.0, %c
+  %not = xor i32 %iftmp.1.0, -1
+  %t2 = and i32 %not, %d
+  %t3 = or i32 %t1, %t2
+  ret i32 %t3
+}
+
+; In the following tests (8 commutation variants), verify that a bitcast doesn't get
+; in the way of a select transform. These bitcasts are common in SSE/AVX and possibly
+; other vector code because of canonicalization to i64 elements for vectors.
+
+; The fptosi instructions are included to avoid commutation canonicalization based on
+; operator weight. Using another cast operator ensures that both operands of all logic
+; ops are equally weighted, and this ensures that we're testing all commutation
+; possibilities.
+
+define <2 x i64> @bitcast_select_swap0(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @bitcast_select_swap0(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SIA]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[SIB]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %bc1 = bitcast <4 x i32> %sext to <2 x i64>
+  %and1 = and <2 x i64> %bc1, %sia
+  %neg = xor <4 x i32> %sext, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc2 = bitcast <4 x i32> %neg to <2 x i64>
+  %and2 = and <2 x i64> %bc2, %sib
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+
+define <2 x i64> @bitcast_select_swap1(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @bitcast_select_swap1(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SIA]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[SIB]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %bc1 = bitcast <4 x i32> %sext to <2 x i64>
+  %and1 = and <2 x i64> %bc1, %sia
+  %neg = xor <4 x i32> %sext, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc2 = bitcast <4 x i32> %neg to <2 x i64>
+  %and2 = and <2 x i64> %bc2, %sib
+  %or = or <2 x i64> %and2, %and1
+  ret <2 x i64> %or
+}
+
+define <2 x i64> @bitcast_select_swap2(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @bitcast_select_swap2(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SIA]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[SIB]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %bc1 = bitcast <4 x i32> %sext to <2 x i64>
+  %and1 = and <2 x i64> %bc1, %sia
+  %neg = xor <4 x i32> %sext, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc2 = bitcast <4 x i32> %neg to <2 x i64>
+  %and2 = and <2 x i64> %sib, %bc2
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+
+define <2 x i64> @bitcast_select_swap3(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @bitcast_select_swap3(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SIA]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[SIB]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %bc1 = bitcast <4 x i32> %sext to <2 x i64>
+  %and1 = and <2 x i64> %bc1, %sia
+  %neg = xor <4 x i32> %sext, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc2 = bitcast <4 x i32> %neg to <2 x i64>
+  %and2 = and <2 x i64> %sib, %bc2
+  %or = or <2 x i64> %and2, %and1
+  ret <2 x i64> %or
+}
+
+define <2 x i64> @bitcast_select_swap4(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @bitcast_select_swap4(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SIA]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[SIB]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %bc1 = bitcast <4 x i32> %sext to <2 x i64>
+  %and1 = and <2 x i64> %sia, %bc1
+  %neg = xor <4 x i32> %sext, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc2 = bitcast <4 x i32> %neg to <2 x i64>
+  %and2 = and <2 x i64> %bc2, %sib
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+
+define <2 x i64> @bitcast_select_swap5(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @bitcast_select_swap5(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SIA]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[SIB]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %bc1 = bitcast <4 x i32> %sext to <2 x i64>
+  %and1 = and <2 x i64> %sia, %bc1
+  %neg = xor <4 x i32> %sext, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc2 = bitcast <4 x i32> %neg to <2 x i64>
+  %and2 = and <2 x i64> %bc2, %sib
+  %or = or <2 x i64> %and2, %and1
+  ret <2 x i64> %or
+}
+
+define <2 x i64> @bitcast_select_swap6(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @bitcast_select_swap6(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SIA]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[SIB]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %bc1 = bitcast <4 x i32> %sext to <2 x i64>
+  %and1 = and <2 x i64> %sia, %bc1
+  %neg = xor <4 x i32> %sext, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc2 = bitcast <4 x i32> %neg to <2 x i64>
+  %and2 = and <2 x i64> %sib, %bc2
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+
+define <2 x i64> @bitcast_select_swap7(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @bitcast_select_swap7(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SIA]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[SIB]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %bc1 = bitcast <4 x i32> %sext to <2 x i64>
+  %and1 = and <2 x i64> %sia, %bc1
+  %neg = xor <4 x i32> %sext, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc2 = bitcast <4 x i32> %neg to <2 x i64>
+  %and2 = and <2 x i64> %sib, %bc2
+  %or = or <2 x i64> %and2, %and1
+  ret <2 x i64> %or
+}
+
+define <2 x i64> @bitcast_select_multi_uses(<4 x i1> %cmp, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @bitcast_select_multi_uses(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[SEXT]] to <2 x i64>
+; CHECK-NEXT:    [[AND1:%.*]] = and <2 x i64> [[BC1]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SEXT]] to <2 x i64>
+; CHECK-NEXT:    [[BC2:%.*]] = xor <2 x i64> [[TMP1]], <i64 -1, i64 -1>
+; CHECK-NEXT:    [[AND2:%.*]] = and <2 x i64> [[BC2]], [[B:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i64> [[AND2]], [[AND1]]
+; CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[AND2]], [[BC2]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[OR]], [[ADD]]
+; CHECK-NEXT:    ret <2 x i64> [[SUB]]
+;
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %bc1 = bitcast <4 x i32> %sext to <2 x i64>
+  %and1 = and <2 x i64> %a, %bc1
+  %neg = xor <4 x i32> %sext, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc2 = bitcast <4 x i32> %neg to <2 x i64>
+  %and2 = and <2 x i64> %b, %bc2
+  %or = or <2 x i64> %and2, %and1
+  %add = add <2 x i64> %and2, %bc2
+  %sub = sub <2 x i64> %or, %add
+  ret <2 x i64> %sub
+}
+
+define i1 @bools(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: @bools(
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %not = xor i1 %c, -1
+  %and1 = and i1 %not, %a
+  %and2 = and i1 %c, %b
+  %or = or i1 %and1, %and2
+  ret i1 %or
+}
+
+; Form a select if we know we can replace 2 simple logic ops.
+
+define i1 @bools_multi_uses1(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: @bools_multi_uses1(
+; CHECK-NEXT:    [[NOT:%.*]] = xor i1 [[C:%.*]], true
+; CHECK-NEXT:    [[AND1:%.*]] = and i1 [[NOT]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[C]], i1 [[B:%.*]], i1 [[A]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i1 [[TMP1]], [[AND1]]
+; CHECK-NEXT:    ret i1 [[XOR]]
+;
+  %not = xor i1 %c, -1
+  %and1 = and i1 %not, %a
+  %and2 = and i1 %c, %b
+  %or = or i1 %and1, %and2
+  %xor = xor i1 %or, %and1
+  ret i1 %xor
+}
+
+; Don't replace a cheap logic op with a potentially expensive select
+; unless we can also eliminate one of the other original ops.
+
+define i1 @bools_multi_uses2(i1 %a, i1 %b, i1 %c) {
+; CHECK-LABEL: @bools_multi_uses2(
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %not = xor i1 %c, -1
+  %and1 = and i1 %not, %a
+  %and2 = and i1 %c, %b
+  %or = or i1 %and1, %and2
+  %add = add i1 %and1, %and2
+  %and3 = and i1 %or, %add
+  ret i1 %and3
+}
+
+define <4 x i1> @vec_of_bools(<4 x i1> %a, <4 x i1> %b, <4 x i1> %c) {
+; CHECK-LABEL: @vec_of_bools(
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[C:%.*]], <4 x i1> [[B:%.*]], <4 x i1> [[A:%.*]]
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %not = xor <4 x i1> %c, <i1 true, i1 true, i1 true, i1 true>
+  %and1 = and <4 x i1> %not, %a
+  %and2 = and <4 x i1> %b, %c
+  %or = or <4 x i1> %and2, %and1
+  ret <4 x i1> %or
+}
+
+define i4 @vec_of_casted_bools(i4 %a, i4 %b, <4 x i1> %c) {
+; CHECK-LABEL: @vec_of_casted_bools(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i4 [[A:%.*]] to <4 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i4 [[B:%.*]] to <4 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[C:%.*]], <4 x i1> [[TMP2]], <4 x i1> [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i1> [[TMP3]] to i4
+; CHECK-NEXT:    ret i4 [[TMP4]]
+;
+  %not = xor <4 x i1> %c, <i1 true, i1 true, i1 true, i1 true>
+  %bc1 = bitcast <4 x i1> %not to i4
+  %bc2 = bitcast <4 x i1> %c to i4
+  %and1 = and i4 %a, %bc1
+  %and2 = and i4 %bc2, %b
+  %or = or i4 %and1, %and2
+  ret i4 %or
+}
+
+; Inverted 'and' constants mean this is a select which is canonicalized to a shuffle.
+
+define <4 x i32> @vec_sel_consts(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @vec_sel_consts(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 -1>
+  %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 0>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+
+define <3 x i129> @vec_sel_consts_weird(<3 x i129> %a, <3 x i129> %b) {
+; CHECK-LABEL: @vec_sel_consts_weird(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i129> [[A:%.*]], <3 x i129> [[B:%.*]], <3 x i32> <i32 0, i32 4, i32 2>
+; CHECK-NEXT:    ret <3 x i129> [[TMP1]]
+;
+  %and1 = and <3 x i129> %a, <i129 -1, i129 0, i129 -1>
+  %and2 = and <3 x i129> %b, <i129 0, i129 -1, i129 0>
+  %or = or <3 x i129> %and2, %and1
+  ret <3 x i129> %or
+}
+
+; The mask elements must be inverted for this to be a select.
+
+define <4 x i32> @vec_not_sel_consts(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @vec_not_sel_consts(
+; CHECK-NEXT:    [[AND1:%.*]] = and <4 x i32> [[A:%.*]], <i32 -1, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    [[AND2:%.*]] = and <4 x i32> [[B:%.*]], <i32 0, i32 -1, i32 0, i32 -1>
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND1]], [[AND2]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
+  %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 0, i32 -1>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @vec_not_sel_consts_undef_elts(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @vec_not_sel_consts_undef_elts(
+; CHECK-NEXT:    [[AND1:%.*]] = and <4 x i32> [[A:%.*]], <i32 -1, i32 undef, i32 0, i32 0>
+; CHECK-NEXT:    [[AND2:%.*]] = and <4 x i32> [[B:%.*]], <i32 0, i32 -1, i32 0, i32 undef>
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[AND1]], [[AND2]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %and1 = and <4 x i32> %a, <i32 -1, i32 undef, i32 0, i32 0>
+  %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 0, i32 undef>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+
+; The inverted constants may be operands of xor instructions.
+
+define <4 x i32> @vec_sel_xor(<4 x i32> %a, <4 x i32> %b, <4 x i1> %c) {
+; CHECK-LABEL: @vec_sel_xor(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[C:%.*]], <i1 false, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %mask = sext <4 x i1> %c to <4 x i32>
+  %mask_flip1 = xor <4 x i32> %mask, <i32 -1, i32 0, i32 0, i32 0>
+  %not_mask_flip1 = xor <4 x i32> %mask, <i32 0, i32 -1, i32 -1, i32 -1>
+  %and1 = and <4 x i32> %not_mask_flip1, %a
+  %and2 = and <4 x i32> %mask_flip1, %b
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+
+; Allow the transform even if the mask values have multiple uses because
+; there's still a net reduction of instructions from removing the and/and/or.
+
+define <4 x i32> @vec_sel_xor_multi_use(<4 x i32> %a, <4 x i32> %b, <4 x i1> %c) {
+; CHECK-LABEL: @vec_sel_xor_multi_use(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[C:%.*]], <i1 true, i1 false, i1 false, i1 false>
+; CHECK-NEXT:    [[MASK_FLIP1:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[C]], <i1 false, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[TMP3]], [[MASK_FLIP1]]
+; CHECK-NEXT:    ret <4 x i32> [[ADD]]
+;
+  %mask = sext <4 x i1> %c to <4 x i32>
+  %mask_flip1 = xor <4 x i32> %mask, <i32 -1, i32 0, i32 0, i32 0>
+  %not_mask_flip1 = xor <4 x i32> %mask, <i32 0, i32 -1, i32 -1, i32 -1>
+  %and1 = and <4 x i32> %not_mask_flip1, %a
+  %and2 = and <4 x i32> %mask_flip1, %b
+  %or = or <4 x i32> %and1, %and2
+  %add = add <4 x i32> %or, %mask_flip1
+  ret <4 x i32> %add
+}
+
+; The 'ashr' guarantees that we have a bitmask, so this is a select with a truncated condition.
+
+define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) {
+; CHECK-LABEL: @allSignBits(
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp slt i32 [[COND:%.*]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTNOT]], i32 [[TVAL:%.*]], i32 [[FVAL:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %bitmask = ashr i32 %cond, 31
+  %not_bitmask = xor i32 %bitmask, -1
+  %a1 = and i32 %tval, %bitmask
+  %a2 = and i32 %not_bitmask, %fval
+  %sel = or i32 %a1, %a2
+  ret i32 %sel
+}
+
+define <4 x i8> @allSignBits_vec(<4 x i8> %cond, <4 x i8> %tval, <4 x i8> %fval) {
+; CHECK-LABEL: @allSignBits_vec(
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp sgt <4 x i8> [[COND:%.*]], <i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i8> [[FVAL:%.*]], <4 x i8> [[TVAL:%.*]]
+; CHECK-NEXT:    ret <4 x i8> [[TMP1]]
+;
+  %bitmask = ashr <4 x i8> %cond, <i8 7, i8 7, i8 7, i8 7>
+  %not_bitmask = xor <4 x i8> %bitmask, <i8 -1, i8 -1, i8 -1, i8 -1>
+  %a1 = and <4 x i8> %tval, %bitmask
+  %a2 = and <4 x i8> %fval, %not_bitmask
+  %sel = or <4 x i8> %a2, %a1
+  ret <4 x i8> %sel
+}
+
+; Negative test - make sure that bitcasts from FP do not cause a crash.
+
+define <2 x i64> @fp_bitcast(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @fp_bitcast(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[AND1:%.*]] = and <2 x i64> [[SIA]], [[BC1]]
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[AND2:%.*]] = and <2 x i64> [[SIB]], [[BC2]]
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i64> [[AND2]], [[AND1]]
+; CHECK-NEXT:    ret <2 x i64> [[OR]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %bc1 = bitcast <2 x double> %a to <2 x i64>
+  %and1 = and <2 x i64> %sia, %bc1
+  %bc2 = bitcast <2 x double> %b to <2 x i64>
+  %and2 = and <2 x i64> %sib, %bc2
+  %or = or <2 x i64> %and2, %and1
+  ret <2 x i64> %or
+}
+
+define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; CHECK-LABEL: @computesignbits_through_shuffles(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ole <4 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF_OR1:%.*]] = or <4 x i32> [[S1]], [[S2]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <4 x i32> [[SHUF_OR1]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <4 x i32> [[SHUF_OR1]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF_OR2:%.*]] = or <4 x i32> [[S3]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i32> [[SHUF_OR2]] to <4 x i1>
+; CHECK-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[Z:%.*]], <4 x float> [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[DOTV]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %cmp = fcmp ole <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %s1 = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %s2 = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  %shuf_or1 = or <4 x i32> %s1, %s2
+  %s3 = shufflevector <4 x i32> %shuf_or1, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %s4 = shufflevector <4 x i32> %shuf_or1, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  %shuf_or2 = or <4 x i32> %s3, %s4
+  %not_or2 = xor <4 x i32> %shuf_or2, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %xbc = bitcast <4 x float> %x to <4 x i32>
+  %zbc = bitcast <4 x float> %z to <4 x i32>
+  %and1 = and <4 x i32> %not_or2, %xbc
+  %and2 = and <4 x i32> %shuf_or2, %zbc
+  %sel = or <4 x i32> %and1, %and2
+  ret <4 x i32> %sel
+}
+
+define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i32> %y, <4 x i1> %cond1, <4 x i1> %cond2) {
+; CHECK-LABEL: @computesignbits_through_two_input_shuffle(
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i32> [[COND]] to <4 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %sext1 = sext <4 x i1> %cond1 to <4 x i32>
+  %sext2 = sext <4 x i1> %cond2 to <4 x i32>
+  %cond = shufflevector <4 x i32> %sext1, <4 x i32> %sext2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %notcond = xor <4 x i32> %cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and1 = and <4 x i32> %notcond, %x
+  %and2 = and <4 x i32> %cond, %y
+  %sel = or <4 x i32> %and1, %and2
+  ret <4 x i32> %sel
+}
+

diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll
index 4243e823e257..501dc70c46a7 100644
--- a/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll
@@ -202,13 +202,13 @@ define <4 x double> @gather_lane2(double* %base, double %pt)  {
 ; CHECK-LABEL: @gather_lane2(
 ; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <4 x i64> <i64 poison, i64 poison, i64 2, i64 poison>
 ; CHECK-NEXT:    [[PT_V1:%.*]] = insertelement <4 x double> poison, double [[PT:%.*]], i64 0
-; CHECK-NEXT:    [[PT_V2:%.*]] = shufflevector <4 x double> [[PT_V1]], <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 0>
+; CHECK-NEXT:    [[PT_V2:%.*]] = shufflevector <4 x double> [[PT_V1]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 0>
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[PTRS]], i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> [[PT_V2]])
 ; CHECK-NEXT:    ret <4 x double> [[RES]]
 ;
   %ptrs = getelementptr double, double *%base, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
   %pt_v1 = insertelement <4 x double> poison, double %pt, i64 0
-  %pt_v2 = shufflevector <4 x double> %pt_v1, <4 x double> undef, <4 x i32> zeroinitializer
+  %pt_v2 = shufflevector <4 x double> %pt_v1, <4 x double> poison, <4 x i32> zeroinitializer
   %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %pt_v2)
   ret <4 x double> %res
 }

diff --git a/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll
new file mode 100644
index 000000000000..602772800d3a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll
@@ -0,0 +1,1108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare i32 @llvm.abs.i32(i32, i1)
+
+define i32 @pow2_multiplier(i32 %A) {
+; CHECK-LABEL: @pow2_multiplier(
+; CHECK-NEXT:    [[B:%.*]] = shl i32 [[A:%.*]], 1
+; CHECK-NEXT:    ret i32 [[B]]
+;
+  %B = mul i32 %A, 2
+  ret i32 %B
+}
+
+define <2 x i32> @pow2_multiplier_vec(<2 x i32> %A) {
+; CHECK-LABEL: @pow2_multiplier_vec(
+; CHECK-NEXT:    [[B:%.*]] = shl <2 x i32> [[A:%.*]], <i32 3, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[B]]
+;
+  %B = mul <2 x i32> %A, <i32 8, i32 8>
+  ret <2 x i32> %B
+}
+
+define i8 @combine_shl(i8 %A) {
+; CHECK-LABEL: @combine_shl(
+; CHECK-NEXT:    [[C:%.*]] = shl i8 [[A:%.*]], 6
+; CHECK-NEXT:    ret i8 [[C]]
+;
+  %B = mul i8 %A, 8
+  %C = mul i8 %B, 8
+  ret i8 %C
+}
+
+define i32 @neg(i32 %i) {
+; CHECK-LABEL: @neg(
+; CHECK-NEXT:    [[T:%.*]] = sub i32 0, [[I:%.*]]
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %t = mul i32 %i, -1
+  ret i32 %t
+}
+
+; Use the sign-bit as a mask:
+; (zext (A < 0)) * B --> (A >> 31) & B
+
+define i32 @test10(i32 %a, i32 %b) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[A:%.*]], 31
+; CHECK-NEXT:    [[E:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %c = icmp slt i32 %a, 0
+  %d = zext i1 %c to i32
+  %e = mul i32 %d, %b
+  ret i32 %e
+}
+
+define i32 @test11(i32 %a, i32 %b) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[A:%.*]], 31
+; CHECK-NEXT:    [[E:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %c = icmp sle i32 %a, -1
+  %d = zext i1 %c to i32
+  %e = mul i32 %d, %b
+  ret i32 %e
+}
+
+declare void @use32(i32)
+
+define i32 @test12(i32 %a, i32 %b) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[A_LOBIT:%.*]] = lshr i32 [[A:%.*]], 31
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[A]], 31
+; CHECK-NEXT:    [[E:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    call void @use32(i32 [[A_LOBIT]])
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %c = icmp ugt i32 %a, 2147483647
+  %d = zext i1 %c to i32
+  %e = mul i32 %d, %b
+  call void @use32(i32 %d)
+  ret i32 %e
+}
+
+; rdar://7293527
+define i32 @test15(i32 %A, i32 %B) {
+; CHECK-LABEL: @test15(
+; CHECK-NEXT:    [[M:%.*]] = shl i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %shl = shl i32 1, %B
+  %m = mul i32 %shl, %A
+  ret i32 %m
+}
+
+; X * Y (when Y is a boolean) --> Y ? X : 0
+
+define i32 @mul_bool(i32 %x, i1 %y) {
+; CHECK-LABEL: @mul_bool(
+; CHECK-NEXT:    [[M:%.*]] = select i1 [[Y:%.*]], i32 [[X:%.*]], i32 0
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %z = zext i1 %y to i32
+  %m = mul i32 %x, %z
+  ret i32 %m
+}
+
+; Commute and test vector type.
+
+define <2 x i32> @mul_bool_vec(<2 x i32> %x, <2 x i1> %y) {
+; CHECK-LABEL: @mul_bool_vec(
+; CHECK-NEXT:    [[M:%.*]] = select <2 x i1> [[Y:%.*]], <2 x i32> [[X:%.*]], <2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <2 x i32> [[M]]
+;
+  %z = zext <2 x i1> %y to <2 x i32>
+  %m = mul <2 x i32> %x, %z
+  ret <2 x i32> %m
+}
+
+define <2 x i32> @mul_bool_vec_commute(<2 x i32> %x, <2 x i1> %y) {
+; CHECK-LABEL: @mul_bool_vec_commute(
+; CHECK-NEXT:    [[M:%.*]] = select <2 x i1> [[Y:%.*]], <2 x i32> [[X:%.*]], <2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <2 x i32> [[M]]
+;
+  %z = zext <2 x i1> %y to <2 x i32>
+  %m = mul <2 x i32> %z, %x
+  ret <2 x i32> %m
+}
+
+define <3 x i7> @mul_bools(<3 x i1> %x, <3 x i1> %y) {
+; CHECK-LABEL: @mul_bools(
+; CHECK-NEXT:    [[MULBOOL:%.*]] = and <3 x i1> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext <3 x i1> [[MULBOOL]] to <3 x i7>
+; CHECK-NEXT:    ret <3 x i7> [[R]]
+;
+  %zx = zext <3 x i1> %x to <3 x i7>
+  %zy = zext <3 x i1> %y to <3 x i7>
+  %r = mul <3 x i7> %zx, %zy
+  ret <3 x i7> %r
+}
+
+define i32 @mul_bools_use1(i1 %x, i1 %y) {
+; CHECK-LABEL: @mul_bools_use1(
+; CHECK-NEXT:    [[ZY:%.*]] = zext i1 [[Y:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[ZY]])
+; CHECK-NEXT:    [[MULBOOL:%.*]] = and i1 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = zext i1 [[MULBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %zx = zext i1 %x to i32
+  %zy = zext i1 %y to i32
+  call void @use32(i32 %zy)
+  %r = mul i32 %zx, %zy
+  ret i32 %r
+}
+
+define i32 @mul_bools_use2(i1 %x, i1 %y) {
+; CHECK-LABEL: @mul_bools_use2(
+; CHECK-NEXT:    [[ZY:%.*]] = zext i1 [[Y:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[ZY]])
+; CHECK-NEXT:    [[MULBOOL:%.*]] = and i1 [[Y]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext i1 [[MULBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %zx = zext i1 %x to i32
+  %zy = zext i1 %y to i32
+  call void @use32(i32 %zy)
+  %r = mul i32 %zy, %zx
+  ret i32 %r
+}
+
+define i32 @mul_bools_use3(i1 %x, i1 %y) {
+; CHECK-LABEL: @mul_bools_use3(
+; CHECK-NEXT:    [[ZX:%.*]] = zext i1 [[X:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[ZX]])
+; CHECK-NEXT:    [[ZY:%.*]] = zext i1 [[Y:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[ZY]])
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[X]], i32 [[ZY]], i32 0
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %zx = zext i1 %x to i32
+  call void @use32(i32 %zx)
+  %zy = zext i1 %y to i32
+  call void @use32(i32 %zy)
+  %r = mul i32 %zx, %zy
+  ret i32 %r
+}
+
+define <3 x i32> @mul_bools_sext(<3 x i1> %x, <3 x i1> %y) {
+; CHECK-LABEL: @mul_bools_sext(
+; CHECK-NEXT:    [[MULBOOL:%.*]] = and <3 x i1> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext <3 x i1> [[MULBOOL]] to <3 x i32>
+; CHECK-NEXT:    ret <3 x i32> [[R]]
+;
+  %sx = sext <3 x i1> %x to <3 x i32>
+  %sy = sext <3 x i1> %y to <3 x i32>
+  %r = mul <3 x i32> %sx, %sy
+  ret <3 x i32> %r
+}
+
+define i32 @mul_bools_sext_use1(i1 %x, i1 %y) {
+; CHECK-LABEL: @mul_bools_sext_use1(
+; CHECK-NEXT:    [[SY:%.*]] = sext i1 [[Y:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[SY]])
+; CHECK-NEXT:    [[MULBOOL:%.*]] = and i1 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = zext i1 [[MULBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sx = sext i1 %x to i32
+  %sy = sext i1 %y to i32
+  call void @use32(i32 %sy)
+  %r = mul i32 %sx, %sy
+  ret i32 %r
+}
+
+define i32 @mul_bools_sext_use2(i1 %x, i1 %y) {
+; CHECK-LABEL: @mul_bools_sext_use2(
+; CHECK-NEXT:    [[SY:%.*]] = sext i1 [[Y:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[SY]])
+; CHECK-NEXT:    [[MULBOOL:%.*]] = and i1 [[Y]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext i1 [[MULBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sx = sext i1 %x to i32
+  %sy = sext i1 %y to i32
+  call void @use32(i32 %sy)
+  %r = mul i32 %sy, %sx
+  ret i32 %r
+}
+
+define i32 @mul_bools_sext_use3(i1 %x, i1 %y) {
+; CHECK-LABEL: @mul_bools_sext_use3(
+; CHECK-NEXT:    [[SX:%.*]] = sext i1 [[X:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[SX]])
+; CHECK-NEXT:    [[SY:%.*]] = sext i1 [[Y:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[SY]])
+; CHECK-NEXT:    [[R:%.*]] = mul nsw i32 [[SY]], [[SX]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sx = sext i1 %x to i32
+  call void @use32(i32 %sx)
+  %sy = sext i1 %y to i32
+  call void @use32(i32 %sy)
+  %r = mul i32 %sy, %sx
+  ret i32 %r
+}
+
+define <3 x i32> @mul_bools_mixed_ext(<3 x i1> %x, <3 x i1> %y) {
+; CHECK-LABEL: @mul_bools_mixed_ext(
+; CHECK-NEXT:    [[MULBOOL:%.*]] = and <3 x i1> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext <3 x i1> [[MULBOOL]] to <3 x i32>
+; CHECK-NEXT:    ret <3 x i32> [[R]]
+;
+  %zx = zext <3 x i1> %x to <3 x i32>
+  %sy = sext <3 x i1> %y to <3 x i32>
+  %r = mul <3 x i32> %zx, %sy
+  ret <3 x i32> %r
+}
+
+define i32 @mul_bools_mixed_ext_use1(i1 %x, i1 %y) {
+; CHECK-LABEL: @mul_bools_mixed_ext_use1(
+; CHECK-NEXT:    [[ZY:%.*]] = zext i1 [[Y:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[ZY]])
+; CHECK-NEXT:    [[MULBOOL:%.*]] = and i1 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = sext i1 [[MULBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sx = sext i1 %x to i32
+  %zy = zext i1 %y to i32
+  call void @use32(i32 %zy)
+  %r = mul i32 %sx, %zy
+  ret i32 %r
+}
+
+define i32 @mul_bools_mixed_ext_use2(i1 %x, i1 %y) {
+; CHECK-LABEL: @mul_bools_mixed_ext_use2(
+; CHECK-NEXT:    [[SY:%.*]] = sext i1 [[Y:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[SY]])
+; CHECK-NEXT:    [[MULBOOL:%.*]] = and i1 [[Y]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext i1 [[MULBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %zx = zext i1 %x to i32
+  %sy = sext i1 %y to i32
+  call void @use32(i32 %sy)
+  %r = mul i32 %sy, %zx
+  ret i32 %r
+}
+
+define i32 @mul_bools_mixed_ext_use3(i1 %x, i1 %y) {
+; CHECK-LABEL: @mul_bools_mixed_ext_use3(
+; CHECK-NEXT:    [[SX:%.*]] = sext i1 [[X:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[SX]])
+; CHECK-NEXT:    [[ZY:%.*]] = zext i1 [[Y:%.*]] to i32
+; CHECK-NEXT:    call void @use32(i32 [[ZY]])
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[Y]], i32 [[SX]], i32 0
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sx = sext i1 %x to i32
+  call void @use32(i32 %sx)
+  %zy = zext i1 %y to i32
+  call void @use32(i32 %zy)
+  %r = mul i32 %zy, %sx
+  ret i32 %r
+}
+
+; (A >>u 31) * B --> (A >>s 31) & B
+
+define i32 @signbit_mul(i32 %a, i32 %b) {
+; CHECK-LABEL: @signbit_mul(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[A:%.*]], 31
+; CHECK-NEXT:    [[E:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %d = lshr i32 %a, 31
+  %e = mul i32 %d, %b
+  ret i32 %e
+}
+
+define i32 @signbit_mul_commute_extra_use(i32 %a, i32 %b) {
+; CHECK-LABEL: @signbit_mul_commute_extra_use(
+; CHECK-NEXT:    [[D:%.*]] = lshr i32 [[A:%.*]], 31
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[A]], 31
+; CHECK-NEXT:    [[E:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    call void @use32(i32 [[D]])
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %d = lshr i32 %a, 31
+  %e = mul i32 %b, %d
+  call void @use32(i32 %d)
+  ret i32 %e
+}
+
+; (A >>u 31) * B --> (A >>s 31) & B
+
+define <2 x i32> @signbit_mul_vec(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @signbit_mul_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <2 x i32> [[A:%.*]], <i32 31, i32 31>
+; CHECK-NEXT:    [[E:%.*]] = and <2 x i32> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[E]]
+;
+  %d = lshr <2 x i32> %a, <i32 31, i32 31>
+  %e = mul <2 x i32> %d, %b
+  ret <2 x i32> %e
+}
+
+define <2 x i32> @signbit_mul_vec_commute(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @signbit_mul_vec_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <2 x i32> [[A:%.*]], <i32 31, i32 31>
+; CHECK-NEXT:    [[E:%.*]] = and <2 x i32> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[E]]
+;
+  %d = lshr <2 x i32> %a, <i32 31, i32 31>
+  %e = mul <2 x i32> %b, %d
+  ret <2 x i32> %e
+}
+
+define i32 @test18(i32 %A, i32 %B) {
+; CHECK-LABEL: @test18(
+; CHECK-NEXT:    ret i32 0
+;
+  %C = and i32 %A, 1
+  %D = and i32 %B, 1
+  %E = mul i32 %C, %D
+  %F = and i32 %E, 16
+  ret i32 %F
+}
+
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32)
+declare void @use(i1)
+
+define i32 @test19(i32 %A, i32 %B) {
+; CHECK-LABEL: @test19(
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    ret i32 0
+;
+  %C = and i32 %A, 1
+  %D = and i32 %B, 1
+
+; It would be nice if we also started proving that this doesn't overflow.
+  %E = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %C, i32 %D)
+  %F = extractvalue {i32, i1} %E, 0
+  %G = extractvalue {i32, i1} %E, 1
+  call void @use(i1 %G)
+  %H = and i32 %F, 16
+  ret i32 %H
+}
+
+define <2 x i64> @test20(<2 x i64> %A) {
+; CHECK-LABEL: @test20(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <2 x i64> [[A:%.*]], <i64 3, i64 2>
+; CHECK-NEXT:    [[C:%.*]] = add <2 x i64> [[TMP1]], <i64 36, i64 28>
+; CHECK-NEXT:    ret <2 x i64> [[C]]
+;
+  %B = add <2 x i64> %A, <i64 12, i64 14>
+  %C = mul <2 x i64> %B, <i64 3, i64 2>
+  ret <2 x i64> %C
+}
+
+define <2 x i1> @test21(<2 x i1> %A, <2 x i1> %B) {
+; CHECK-LABEL: @test21(
+; CHECK-NEXT:    [[C:%.*]] = and <2 x i1> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %C = mul <2 x i1> %A, %B
+  ret <2 x i1> %C
+}
+
+define i32 @test22(i32 %A) {
+; CHECK-LABEL: @test22(
+; CHECK-NEXT:    [[B:%.*]] = sub nsw i32 0, [[A:%.*]]
+; CHECK-NEXT:    ret i32 [[B]]
+;
+  %B = mul nsw i32 %A, -1
+  ret i32 %B
+}
+
+define i32 @test23(i32 %A) {
+; CHECK-LABEL: @test23(
+; CHECK-NEXT:    [[C:%.*]] = mul nuw i32 [[A:%.*]], 6
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %B = shl nuw i32 %A, 1
+  %C = mul nuw i32 %B, 3
+  ret i32 %C
+}
+
+define i32 @test24(i32 %A) {
+; CHECK-LABEL: @test24(
+; CHECK-NEXT:    [[C:%.*]] = mul nsw i32 [[A:%.*]], 6
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %B = shl nsw i32 %A, 1
+  %C = mul nsw i32 %B, 3
+  ret i32 %C
+}
+
+define i32 @neg_neg_mul(i32 %A, i32 %B) {
+; CHECK-LABEL: @neg_neg_mul(
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %C = sub i32 0, %A
+  %D = sub i32 0, %B
+  %E = mul i32 %C, %D
+  ret i32 %E
+}
+
+define i32 @neg_neg_mul_nsw(i32 %A, i32 %B) {
+; CHECK-LABEL: @neg_neg_mul_nsw(
+; CHECK-NEXT:    [[E:%.*]] = mul nsw i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %C = sub nsw i32 0, %A
+  %D = sub nsw i32 0, %B
+  %E = mul nsw i32 %C, %D
+  ret i32 %E
+}
+
+define i124 @neg_neg_mul_apint(i124 %A, i124 %B) {
+; CHECK-LABEL: @neg_neg_mul_apint(
+; CHECK-NEXT:    [[E:%.*]] = mul i124 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i124 [[E]]
+;
+  %C = sub i124 0, %A
+  %D = sub i124 0, %B
+  %E = mul i124 %C, %D
+  ret i124 %E
+}
+
+define i32 @neg_mul_constant(i32 %A) {
+; CHECK-LABEL: @neg_mul_constant(
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], -7
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %C = sub i32 0, %A
+  %E = mul i32 %C, 7
+  ret i32 %E
+}
+
+define i55 @neg_mul_constant_apint(i55 %A) {
+; CHECK-LABEL: @neg_mul_constant_apint(
+; CHECK-NEXT:    [[E:%.*]] = mul i55 [[A:%.*]], -7
+; CHECK-NEXT:    ret i55 [[E]]
+;
+  %C = sub i55 0, %A
+  %E = mul i55 %C, 7
+  ret i55 %E
+}
+
+define <3 x i8> @neg_mul_constant_vec(<3 x i8> %a) {
+; CHECK-LABEL: @neg_mul_constant_vec(
+; CHECK-NEXT:    [[B:%.*]] = mul <3 x i8> [[A:%.*]], <i8 -5, i8 -5, i8 -5>
+; CHECK-NEXT:    ret <3 x i8> [[B]]
+;
+  %A = sub <3 x i8> zeroinitializer, %a
+  %B = mul <3 x i8> %A, <i8 5, i8 5, i8 5>
+  ret <3 x i8> %B
+}
+
+define <3 x i4> @neg_mul_constant_vec_weird(<3 x i4> %a) {
+; CHECK-LABEL: @neg_mul_constant_vec_weird(
+; CHECK-NEXT:    [[B:%.*]] = mul <3 x i4> [[A:%.*]], <i4 -5, i4 -5, i4 -5>
+; CHECK-NEXT:    ret <3 x i4> [[B]]
+;
+  %A = sub <3 x i4> zeroinitializer, %a
+  %B = mul <3 x i4> %A, <i4 5, i4 5, i4 5>
+  ret <3 x i4> %B
+}
+
+define i32 @test26(i32 %A, i32 %B) {
+; CHECK-LABEL: @test26(
+; CHECK-NEXT:    [[D:%.*]] = shl nsw i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[D]]
+;
+  %C = shl nsw i32 1, %B
+  %D = mul nsw i32 %A, %C
+  ret i32 %D
+}
+
+define i32 @test27(i32 %A, i32 %B) {
+; CHECK-LABEL: @test27(
+; CHECK-NEXT:    [[D:%.*]] = shl nuw i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[D]]
+;
+  %C = shl i32 1, %B
+  %D = mul nuw i32 %A, %C
+  ret i32 %D
+}
+
+define i32 @test28(i32 %A) {
+; CHECK-LABEL: @test28(
+; CHECK-NEXT:    [[B:%.*]] = shl i32 1, [[A:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = shl i32 [[B]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %B = shl i32 1, %A
+  %C = mul nsw i32 %B, %B
+  ret i32 %C
+}
+
+define i64 @test29(i31 %A, i31 %B) {
+; CHECK-LABEL: @test29(
+; CHECK-NEXT:    [[C:%.*]] = sext i31 [[A:%.*]] to i64
+; CHECK-NEXT:    [[D:%.*]] = sext i31 [[B:%.*]] to i64
+; CHECK-NEXT:    [[E:%.*]] = mul nsw i64 [[C]], [[D]]
+; CHECK-NEXT:    ret i64 [[E]]
+;
+  %C = sext i31 %A to i64
+  %D = sext i31 %B to i64
+  %E = mul i64 %C, %D
+  ret i64 %E
+}
+
+define i64 @test30(i32 %A, i32 %B) {
+; CHECK-LABEL: @test30(
+; CHECK-NEXT:    [[C:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[D:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[E:%.*]] = mul nuw i64 [[C]], [[D]]
+; CHECK-NEXT:    ret i64 [[E]]
+;
+  %C = zext i32 %A to i64
+  %D = zext i32 %B to i64
+  %E = mul i64 %C, %D
+  ret i64 %E
+}
+
+@PR22087 = external global i32
+define i32 @test31(i32 %V) {
+; CHECK-LABEL: @test31(
+; CHECK-NEXT:    [[MUL:%.*]] = shl i32 [[V:%.*]], zext (i1 icmp ne (i32* inttoptr (i64 1 to i32*), i32* @PR22087) to i32)
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %mul = mul i32 %V, shl (i32 1, i32 zext (i1 icmp ne (i32* inttoptr (i64 1 to i32*), i32* @PR22087) to i32))
+  ret i32 %mul
+}
+
+define i32 @test32(i32 %X) {
+; CHECK-LABEL: @test32(
+; CHECK-NEXT:    [[MUL:%.*]] = shl i32 [[X:%.*]], 31
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %mul = mul nsw i32 %X, -2147483648
+  ret i32 %mul
+}
+
+define <2 x i32> @test32vec(<2 x i32> %X) {
+; CHECK-LABEL: @test32vec(
+; CHECK-NEXT:    [[MUL:%.*]] = shl <2 x i32> [[X:%.*]], <i32 31, i32 31>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %mul = mul nsw <2 x i32> %X, <i32 -2147483648, i32 -2147483648>
+  ret <2 x i32> %mul
+}
+
+define i32 @test33(i32 %X) {
+; CHECK-LABEL: @test33(
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[X:%.*]], 30
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %mul = mul nsw i32 %X, 1073741824
+  ret i32 %mul
+}
+
+define <2 x i32> @test33vec(<2 x i32> %X) {
+; CHECK-LABEL: @test33vec(
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw <2 x i32> [[X:%.*]], <i32 30, i32 30>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %mul = mul nsw <2 x i32> %X, <i32 1073741824, i32 1073741824>
+  ret <2 x i32> %mul
+}
+
+define i128 @test34(i128 %X) {
+; CHECK-LABEL: @test34(
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i128 [[X:%.*]], 1
+; CHECK-NEXT:    ret i128 [[MUL]]
+;
+  %mul = mul nsw i128 %X, 2
+  ret i128 %mul
+}
+
+define i32 @test_mul_canonicalize_op0(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_canonicalize_op0(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = sub i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %neg = sub i32 0, %x
+  %mul = mul i32 %neg, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_canonicalize_op1(i32 %x, i32 %z) {
+; CHECK-LABEL: @test_mul_canonicalize_op1(
+; CHECK-NEXT:    [[Y_NEG:%.*]] = mul i32 [[Z:%.*]], -3
+; CHECK-NEXT:    [[DOTNEG:%.*]] = mul i32 [[Y_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[DOTNEG]]
+;
+  %y = mul i32 %z, 3
+  %neg = sub i32 0, %x
+  %mul = mul i32 %y, %neg
+  ret i32 %mul
+}
+
+define i32 @test_mul_canonicalize_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_canonicalize_nsw(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = sub i32 0, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %neg = sub nsw i32 0, %x
+  %mul = mul nsw i32 %neg, %y
+  ret i32 %mul
+}
+
+define <2 x i32> @test_mul_canonicalize_vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test_mul_canonicalize_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = sub <2 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %neg = sub <2 x i32> <i32 0, i32 0>, %x
+  %mul = mul <2 x i32> %neg, %y
+  ret <2 x i32> %mul
+}
+
+define i32 @test_mul_canonicalize_multiple_uses(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_canonicalize_multiple_uses(
+; CHECK-NEXT:    [[NEG:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[NEG]], [[Y:%.*]]
+; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 [[MUL]], [[NEG]]
+; CHECK-NEXT:    ret i32 [[MUL2]]
+;
+  %neg = sub i32 0, %x
+  %mul = mul i32 %neg, %y
+  %mul2 = mul i32 %mul, %neg
+  ret i32 %mul2
+}
+
+@X = global i32 5
+
+define i64 @test_mul_canonicalize_neg_is_not_undone(i64 %L1) {
+; Check we do not undo the canonicalization of 0 - (X * Y), if Y is a constant
+; expr.
+; CHECK-LABEL: @test_mul_canonicalize_neg_is_not_undone(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[L1:%.*]], ptrtoint (i32* @X to i64)
+; CHECK-NEXT:    [[B4:%.*]] = sub i64 0, [[TMP1]]
+; CHECK-NEXT:    ret i64 [[B4]]
+;
+  %v1 = ptrtoint i32* @X to i64
+  %B8 = sub i64 0, %v1
+  %B4 = mul i64 %B8, %L1
+  ret i64 %B4
+}
+
+define i32 @negate_if_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @negate_if_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[X]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sel = select i1 %cond, i32 -1, i32 1
+  %r = mul i32 %sel, %x
+  ret i32 %r
+}
+
+define i32 @negate_if_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @negate_if_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND:%.*]], i32 [[X]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sel = select i1 %cond, i32 1, i32 -1
+  %r = mul i32 %sel, %x
+  ret i32 %r
+}
+
+define <2 x i8> @negate_if_true_commute(<2 x i8> %px, i1 %cond) {
+; CHECK-LABEL: @negate_if_true_commute(
+; CHECK-NEXT:    [[X:%.*]] = sdiv <2 x i8> <i8 42, i8 42>, [[PX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <2 x i8> zeroinitializer, [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND:%.*]], <2 x i8> [[TMP1]], <2 x i8> [[X]]
+; CHECK-NEXT:    ret <2 x i8> [[TMP2]]
+;
+  %x = sdiv <2 x i8> <i8 42, i8 42>, %px  ; thwart complexity-based canonicalization
+  %sel = select i1 %cond, <2 x i8> <i8 -1, i8 -1>, <2 x i8> <i8 1, i8 1>
+  %r = mul <2 x i8> %x, %sel
+  ret <2 x i8> %r
+}
+
+define <2 x i8> @negate_if_false_commute(<2 x i8> %px, <2 x i1> %cond) {
+; CHECK-LABEL: @negate_if_false_commute(
+; CHECK-NEXT:    [[X:%.*]] = sdiv <2 x i8> <i8 42, i8 5>, [[PX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <2 x i8> zeroinitializer, [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[COND:%.*]], <2 x i8> [[X]], <2 x i8> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i8> [[TMP2]]
+;
+  %x = sdiv <2 x i8> <i8 42, i8 5>, %px  ; thwart complexity-based canonicalization
+  %sel = select <2 x i1> %cond, <2 x i8> <i8 1, i8 undef>, <2 x i8> <i8 -1, i8 -1>
+  %r = mul <2 x i8> %x, %sel
+  ret <2 x i8> %r
+}
+
+; Negative test
+
+define i32 @negate_if_true_extra_use(i32 %x, i1 %cond) {
+; CHECK-LABEL: @negate_if_true_extra_use(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], i32 -1, i32 1
+; CHECK-NEXT:    call void @use32(i32 [[SEL]])
+; CHECK-NEXT:    [[R:%.*]] = mul i32 [[SEL]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %sel = select i1 %cond, i32 -1, i32 1
+  call void @use32(i32 %sel)
+  %r = mul i32 %sel, %x
+  ret i32 %r
+}
+
+; Negative test
+
+define <2 x i8> @negate_if_true_wrong_constant(<2 x i8> %px, i1 %cond) {
+; CHECK-LABEL: @negate_if_true_wrong_constant(
+; CHECK-NEXT:    [[X:%.*]] = sdiv <2 x i8> <i8 42, i8 42>, [[PX:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], <2 x i8> <i8 -1, i8 0>, <2 x i8> <i8 1, i8 1>
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[X]], [[SEL]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %x = sdiv <2 x i8> <i8 42, i8 42>, %px  ; thwart complexity-based canonicalization
+  %sel = select i1 %cond, <2 x i8> <i8 -1, i8 0>, <2 x i8> <i8 1, i8 1>
+  %r = mul <2 x i8> %x, %sel
+  ret <2 x i8> %r
+}
+
+; (C ? (X /exact Y) : 1) * Y -> C ? X : Y
+define i32 @mul_div_select(i32 %x, i32 %y, i1 %c) {
+; CHECK-LABEL: @mul_div_select(
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[C:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %div = udiv exact i32 %x, %y
+  %sel = select i1 %c, i32 %div, i32 1
+  %mul = mul i32 %sel, %y
+  ret i32 %mul
+}
+
+; fold mul(abs(x),abs(x)) -> mul(x,x)
+define i31 @combine_mul_abs_i31(i31 %0) {
+; CHECK-LABEL: @combine_mul_abs_i31(
+; CHECK-NEXT:    [[M:%.*]] = mul i31 [[TMP0:%.*]], [[TMP0]]
+; CHECK-NEXT:    ret i31 [[M]]
+;
+  %c = icmp slt i31 %0, 0
+  %s = sub nsw i31 0, %0
+  %r = select i1 %c, i31 %s, i31 %0
+  %m = mul i31 %r, %r
+  ret i31 %m
+}
+
+define i32 @combine_mul_abs_i32(i32 %0) {
+; CHECK-LABEL: @combine_mul_abs_i32(
+; CHECK-NEXT:    [[M:%.*]] = mul i32 [[TMP0:%.*]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %c = icmp slt i32 %0, 0
+  %s = sub nsw i32 0, %0
+  %r = select i1 %c, i32 %s, i32 %0
+  %m = mul i32 %r, %r
+  ret i32 %m
+}
+
+define <4 x i32> @combine_mul_abs_v4i32(<4 x i32> %0) {
+; CHECK-LABEL: @combine_mul_abs_v4i32(
+; CHECK-NEXT:    [[M:%.*]] = mul <4 x i32> [[TMP0:%.*]], [[TMP0]]
+; CHECK-NEXT:    ret <4 x i32> [[M]]
+;
+  %c = icmp slt <4 x i32> %0, zeroinitializer
+  %s = sub nsw <4 x i32> zeroinitializer, %0
+  %r = select <4 x i1> %c, <4 x i32> %s, <4 x i32> %0
+  %m = mul <4 x i32> %r, %r
+  ret <4 x i32> %m
+}
+
+; fold mul(nabs(x),nabs(x)) -> mul(x,x)
+define i31 @combine_mul_nabs_i31(i31 %0) {
+; CHECK-LABEL: @combine_mul_nabs_i31(
+; CHECK-NEXT:    [[M:%.*]] = mul i31 [[TMP0:%.*]], [[TMP0]]
+; CHECK-NEXT:    ret i31 [[M]]
+;
+  %c = icmp slt i31 %0, 0
+  %s = sub nsw i31 0, %0
+  %r = select i1 %c, i31 %0, i31 %s
+  %m = mul i31 %r, %r
+  ret i31 %m
+}
+
+define i32 @combine_mul_nabs_i32(i32 %0) {
+; CHECK-LABEL: @combine_mul_nabs_i32(
+; CHECK-NEXT:    [[M:%.*]] = mul i32 [[TMP0:%.*]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %c = icmp slt i32 %0, 0
+  %s = sub nsw i32 0, %0
+  %r = select i1 %c, i32 %0, i32 %s
+  %m = mul i32 %r, %r
+  ret i32 %m
+}
+
+define <4 x i32> @combine_mul_nabs_v4i32(<4 x i32> %0) {
+; CHECK-LABEL: @combine_mul_nabs_v4i32(
+; CHECK-NEXT:    [[M:%.*]] = mul <4 x i32> [[TMP0:%.*]], [[TMP0]]
+; CHECK-NEXT:    ret <4 x i32> [[M]]
+;
+  %c = icmp slt <4 x i32> %0, zeroinitializer
+  %s = sub nsw <4 x i32> zeroinitializer, %0
+  %r = select <4 x i1> %c, <4 x i32> %0, <4 x i32> %s
+  %m = mul <4 x i32> %r, %r
+  ret <4 x i32> %m
+}
+
+define i32 @combine_mul_abs_intrin(i32 %x) {
+; CHECK-LABEL: @combine_mul_abs_intrin(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[X:%.*]], [[X]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %abs = call i32 @llvm.abs.i32(i32 %x, i1 false)
+  %mul = mul i32 %abs, %abs
+  ret i32 %mul
+}
+
+define i32 @combine_mul_nabs_intrin(i32 %x) {
+; CHECK-LABEL: @combine_mul_nabs_intrin(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[X:%.*]], [[X]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %abs = call i32 @llvm.abs.i32(i32 %x, i1 false)
+  %neg = sub i32 0, %abs
+  %mul = mul i32 %neg, %neg
+  ret i32 %mul
+}
+
+; z * splat(0) = splat(0), even for scalable vectors
+define <vscale x 2 x i64> @mul_scalable_splat_zero(<vscale x 2 x i64> %z) {
+; CHECK-LABEL: @mul_scalable_splat_zero(
+; CHECK-NEXT:    ret <vscale x 2 x i64> zeroinitializer
+;
+  %shuf = shufflevector <vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  %t3 = mul <vscale x 2 x i64> %shuf, %z
+  ret <vscale x 2 x i64> %t3
+}
+
+;
+; fold mul(sub(x,y),negpow2) -> shl(sub(y,x),log2(pow2))
+;
+
+define i32 @mulsub1(i32 %a0, i32 %a1) {
+; CHECK-LABEL: @mulsub1(
+; CHECK-NEXT:    [[SUB_NEG:%.*]] = sub i32 [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = shl i32 [[SUB_NEG]], 2
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sub = sub i32 %a1, %a0
+  %mul = mul i32 %sub, -4
+  ret i32 %mul
+}
+
+define <2 x i32> @mulsub1_vec(<2 x i32> %a0, <2 x i32> %a1) {
+; CHECK-LABEL: @mulsub1_vec(
+; CHECK-NEXT:    [[SUB_NEG:%.*]] = sub <2 x i32> [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], <i32 2, i32 2>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %sub = sub <2 x i32> %a1, %a0
+  %mul = mul <2 x i32> %sub, <i32 -4, i32 -4>
+  ret <2 x i32> %mul
+}
+
+define <2 x i32> @mulsub1_vec_nonuniform(<2 x i32> %a0, <2 x i32> %a1) {
+; CHECK-LABEL: @mulsub1_vec_nonuniform(
+; CHECK-NEXT:    [[SUB_NEG:%.*]] = sub <2 x i32> [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %sub = sub <2 x i32> %a1, %a0
+  %mul = mul <2 x i32> %sub, <i32 -4, i32 -8>
+  ret <2 x i32> %mul
+}
+
+define <2 x i32> @mulsub1_vec_nonuniform_undef(<2 x i32> %a0, <2 x i32> %a1) {
+; CHECK-LABEL: @mulsub1_vec_nonuniform_undef(
+; CHECK-NEXT:    [[SUB_NEG:%.*]] = sub <2 x i32> [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], <i32 2, i32 0>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %sub = sub <2 x i32> %a1, %a0
+  %mul = mul <2 x i32> %sub, <i32 -4, i32 undef>
+  ret <2 x i32> %mul
+}
+
+define i32 @mulsub2(i32 %a0) {
+; CHECK-LABEL: @mulsub2(
+; CHECK-NEXT:    [[SUB_NEG:%.*]] = shl i32 [[A0:%.*]], 2
+; CHECK-NEXT:    [[MUL:%.*]] = add i32 [[SUB_NEG]], -64
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sub = sub i32 16, %a0
+  %mul = mul i32 %sub, -4
+  ret i32 %mul
+}
+
+define <2 x i32> @mulsub2_vec(<2 x i32> %a0) {
+; CHECK-LABEL: @mulsub2_vec(
+; CHECK-NEXT:    [[SUB_NEG:%.*]] = shl <2 x i32> [[A0:%.*]], <i32 2, i32 2>
+; CHECK-NEXT:    [[MUL:%.*]] = add <2 x i32> [[SUB_NEG]], <i32 -64, i32 -64>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %sub = sub <2 x i32> <i32 16, i32 16>, %a0
+  %mul = mul <2 x i32> %sub, <i32 -4, i32 -4>
+  ret <2 x i32> %mul
+}
+
+define <2 x i32> @mulsub2_vec_nonuniform(<2 x i32> %a0) {
+; CHECK-LABEL: @mulsub2_vec_nonuniform(
+; CHECK-NEXT:    [[SUB_NEG:%.*]] = add <2 x i32> [[A0:%.*]], <i32 -16, i32 -32>
+; CHECK-NEXT:    [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %sub = sub <2 x i32> <i32 16, i32 32>, %a0
+  %mul = mul <2 x i32> %sub, <i32 -4, i32 -8>
+  ret <2 x i32> %mul
+}
+
+define <2 x i32> @mulsub2_vec_nonuniform_undef(<2 x i32> %a0) {
+; CHECK-LABEL: @mulsub2_vec_nonuniform_undef(
+; CHECK-NEXT:    [[SUB_NEG:%.*]] = add <2 x i32> [[A0:%.*]], <i32 -16, i32 -32>
+; CHECK-NEXT:    [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], <i32 2, i32 0>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %sub = sub <2 x i32> <i32 16, i32 32>, %a0
+  %mul = mul <2 x i32> %sub, <i32 -4, i32 undef>
+  ret <2 x i32> %mul
+}
+
+define i32 @muladd2(i32 %a0) {
+; CHECK-LABEL: @muladd2(
+; CHECK-NEXT:    [[ADD_NEG_NEG:%.*]] = mul i32 [[A0:%.*]], -4
+; CHECK-NEXT:    [[MUL:%.*]] = add i32 [[ADD_NEG_NEG]], -64
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %add = add i32 %a0, 16
+  %mul = mul i32 %add, -4
+  ret i32 %mul
+}
+
+define <2 x i32> @muladd2_vec(<2 x i32> %a0) {
+; CHECK-LABEL: @muladd2_vec(
+; CHECK-NEXT:    [[ADD_NEG_NEG:%.*]] = mul <2 x i32> [[A0:%.*]], <i32 -4, i32 -4>
+; CHECK-NEXT:    [[MUL:%.*]] = add <2 x i32> [[ADD_NEG_NEG]], <i32 -64, i32 -64>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %add = add <2 x i32> %a0, <i32 16, i32 16>
+  %mul = mul <2 x i32> %add, <i32 -4, i32 -4>
+  ret <2 x i32> %mul
+}
+
+define <2 x i32> @muladd2_vec_nonuniform(<2 x i32> %a0) {
+; CHECK-LABEL: @muladd2_vec_nonuniform(
+; CHECK-NEXT:    [[ADD_NEG:%.*]] = sub <2 x i32> <i32 -16, i32 -32>, [[A0:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = shl <2 x i32> [[ADD_NEG]], <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %add = add <2 x i32> %a0, <i32 16, i32 32>
+  %mul = mul <2 x i32> %add, <i32 -4, i32 -8>
+  ret <2 x i32> %mul
+}
+
+define <2 x i32> @muladd2_vec_nonuniform_undef(<2 x i32> %a0) {
+; CHECK-LABEL: @muladd2_vec_nonuniform_undef(
+; CHECK-NEXT:    [[ADD_NEG:%.*]] = sub <2 x i32> <i32 -16, i32 -32>, [[A0:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = shl <2 x i32> [[ADD_NEG]], <i32 2, i32 0>
+; CHECK-NEXT:    ret <2 x i32> [[MUL]]
+;
+  %add = add <2 x i32> %a0, <i32 16, i32 32>
+  %mul = mul <2 x i32> %add, <i32 -4, i32 undef>
+  ret <2 x i32> %mul
+}
+
+define i32 @mulmuladd2(i32 %a0, i32 %a1) {
+; CHECK-LABEL: @mulmuladd2(
+; CHECK-NEXT:    [[ADD_NEG:%.*]] = sub i32 -16, [[A0:%.*]]
+; CHECK-NEXT:    [[MUL1_NEG:%.*]] = mul i32 [[ADD_NEG]], [[A1:%.*]]
+; CHECK-NEXT:    [[MUL2:%.*]] = shl i32 [[MUL1_NEG]], 2
+; CHECK-NEXT:    ret i32 [[MUL2]]
+;
+  %add = add i32 %a0, 16
+  %mul1 = mul i32 %add, %a1
+  %mul2 = mul i32 %mul1, -4
+  ret i32 %mul2
+}
+define i32 @mulmuladd2_extrause0(i32 %a0, i32 %a1) {
+; CHECK-LABEL: @mulmuladd2_extrause0(
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A0:%.*]], 16
+; CHECK-NEXT:    [[MUL1:%.*]] = mul i32 [[ADD]], [[A1:%.*]]
+; CHECK-NEXT:    call void @use32(i32 [[MUL1]])
+; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 [[MUL1]], -4
+; CHECK-NEXT:    ret i32 [[MUL2]]
+;
+  %add = add i32 %a0, 16
+  %mul1 = mul i32 %add, %a1
+  call void @use32(i32 %mul1)
+  %mul2 = mul i32 %mul1, -4
+  ret i32 %mul2
+}
+define i32 @mulmuladd2_extrause1(i32 %a0, i32 %a1) {
+; CHECK-LABEL: @mulmuladd2_extrause1(
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A0:%.*]], 16
+; CHECK-NEXT:    call void @use32(i32 [[ADD]])
+; CHECK-NEXT:    [[MUL1:%.*]] = mul i32 [[ADD]], [[A1:%.*]]
+; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 [[MUL1]], -4
+; CHECK-NEXT:    ret i32 [[MUL2]]
+;
+  %add = add i32 %a0, 16
+  call void @use32(i32 %add)
+  %mul1 = mul i32 %add, %a1
+  %mul2 = mul i32 %mul1, -4
+  ret i32 %mul2
+}
+define i32 @mulmuladd2_extrause2(i32 %a0, i32 %a1) {
+; CHECK-LABEL: @mulmuladd2_extrause2(
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A0:%.*]], 16
+; CHECK-NEXT:    call void @use32(i32 [[ADD]])
+; CHECK-NEXT:    [[MUL1:%.*]] = mul i32 [[ADD]], [[A1:%.*]]
+; CHECK-NEXT:    call void @use32(i32 [[MUL1]])
+; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 [[MUL1]], -4
+; CHECK-NEXT:    ret i32 [[MUL2]]
+;
+  %add = add i32 %a0, 16
+  call void @use32(i32 %add)
+  %mul1 = mul i32 %add, %a1
+  call void @use32(i32 %mul1)
+  %mul2 = mul i32 %mul1, -4
+  ret i32 %mul2
+}
+
+define i32 @mulnot(i32 %a0) {
+; CHECK-LABEL: @mulnot(
+; CHECK-NEXT:    [[ADD_NEG:%.*]] = shl i32 [[A0:%.*]], 2
+; CHECK-NEXT:    [[MUL:%.*]] = add i32 [[ADD_NEG]], 4
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %add = xor i32 %a0, -1
+  %mul = mul i32 %add, -4
+  ret i32 %mul
+}
+define i32 @mulnot_extrause(i32 %a0) {
+; CHECK-LABEL: @mulnot_extrause(
+; CHECK-NEXT:    [[NOT:%.*]] = xor i32 [[A0:%.*]], -1
+; CHECK-NEXT:    call void @use32(i32 [[NOT]])
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[NOT]], -4
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %not = xor i32 %a0, -1
+  call void @use32(i32 %not)
+  %mul = mul i32 %not, -4
+  ret i32 %mul
+}

diff --git a/llvm/test/Transforms/InstCombine/nsw-inseltpoison.ll b/llvm/test/Transforms/InstCombine/nsw-inseltpoison.ll
new file mode 100644
index 000000000000..e64977c23424
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/nsw-inseltpoison.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define i32 @sub1(i32 %x) {
+; CHECK-LABEL: @sub1(
+; CHECK-NEXT:    [[Y:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[Z:%.*]] = sdiv i32 [[Y]], 337
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %y = sub i32 0, %x
+  %z = sdiv i32 %y, 337
+  ret i32 %z
+}
+
+define i32 @sub2(i32 %x) {
+; CHECK-LABEL: @sub2(
+; CHECK-NEXT:    [[Z:%.*]] = sdiv i32 [[X:%.*]], -337
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %y = sub nsw i32 0, %x
+  %z = sdiv i32 %y, 337
+  ret i32 %z
+}
+
+define i1 @shl_icmp(i64 %X) {
+; CHECK-LABEL: @shl_icmp(
+; CHECK-NEXT:    [[B:%.*]] = icmp eq i64 [[X:%.*]], 0
+; CHECK-NEXT:    ret i1 [[B]]
+;
+  %A = shl nuw i64 %X, 2   ; X*4
+  %B = icmp eq i64 %A, 0
+  ret i1 %B
+}
+
+define i64 @shl1(i64 %X, i64* %P) {
+; CHECK-LABEL: @shl1(
+; CHECK-NEXT:    [[A:%.*]] = and i64 [[X:%.*]], 312
+; CHECK-NEXT:    store i64 [[A]], i64* [[P:%.*]], align 4
+; CHECK-NEXT:    [[B:%.*]] = shl nuw nsw i64 [[A]], 8
+; CHECK-NEXT:    ret i64 [[B]]
+;
+  %A = and i64 %X, 312
+  store i64 %A, i64* %P  ; multiple uses of A.
+  %B = shl i64 %A, 8
+  ret i64 %B
+}
+
+define i32 @preserve1(i32 %x) {
+; CHECK-LABEL: @preserve1(
+; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[X:%.*]], 5
+; CHECK-NEXT:    ret i32 [[ADD3]]
+;
+  %add = add nsw i32 %x, 2
+  %add3 = add nsw i32 %add, 3
+  ret i32 %add3
+}
+
+define i8 @nopreserve1(i8 %x) {
+; CHECK-LABEL: @nopreserve1(
+; CHECK-NEXT:    [[ADD3:%.*]] = add i8 [[X:%.*]], -126
+; CHECK-NEXT:    ret i8 [[ADD3]]
+;
+  %add = add nsw i8 %x, 127
+  %add3 = add nsw i8 %add, 3
+  ret i8 %add3
+}
+
+define i8 @nopreserve2(i8 %x) {
+; CHECK-LABEL: @nopreserve2(
+; CHECK-NEXT:    [[ADD3:%.*]] = add i8 [[X:%.*]], 3
+; CHECK-NEXT:    ret i8 [[ADD3]]
+;
+  %add = add i8 %x, 1
+  %add3 = add nsw i8 %add, 2
+  ret i8 %add3
+}
+
+define i8 @nopreserve3(i8 %A, i8 %B) {
+; CHECK-LABEL: @nopreserve3(
+; CHECK-NEXT:    [[Y:%.*]] = add i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[Y]], 20
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %x = add i8 %A, 10
+  %y = add i8 %B, 10
+  %add = add nsw i8 %x, %y
+  ret i8 %add
+}
+
+define i8 @nopreserve4(i8 %A, i8 %B) {
+; CHECK-LABEL: @nopreserve4(
+; CHECK-NEXT:    [[Y:%.*]] = add i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[Y]], 20
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %x = add nsw i8 %A, 10
+  %y = add nsw i8 %B, 10
+  %add = add nsw i8 %x, %y
+  ret i8 %add
+}
+
+define <3 x i32> @shl_nuw_nsw_shuffle_splat_vec(<2 x i8> %x) {
+; CHECK-LABEL: @shl_nuw_nsw_shuffle_splat_vec(
+; CHECK-NEXT:    [[T2:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[T2]], <2 x i32> poison, <3 x i32> <i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[T3:%.*]] = shl nuw nsw <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
+; CHECK-NEXT:    ret <3 x i32> [[T3]]
+;
+  %t2 = zext <2 x i8> %x to <2 x i32>
+  %shuf = shufflevector <2 x i32> %t2, <2 x i32> poison, <3 x i32> <i32 1, i32 0, i32 1>
+  %t3 = shl <3 x i32> %shuf, <i32 17, i32 17, i32 17>
+  ret <3 x i32> %t3
+}
+
+; Negative test - if the shuffle mask contains an undef, we bail out to
+; avoid propagating information that may not be used consistently by callers.
+
+define <3 x i32> @shl_nuw_nsw_shuffle_undef_elt_splat_vec(<2 x i8> %x) {
+; CHECK-LABEL: @shl_nuw_nsw_shuffle_undef_elt_splat_vec(
+; CHECK-NEXT:    [[T2:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[T2]], <2 x i32> poison, <3 x i32> <i32 1, i32 undef, i32 0>
+; CHECK-NEXT:    [[T3:%.*]] = shl <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
+; CHECK-NEXT:    ret <3 x i32> [[T3]]
+;
+  %t2 = zext <2 x i8> %x to <2 x i32>
+  %shuf = shufflevector <2 x i32> %t2, <2 x i32> poison, <3 x i32> <i32 1, i32 undef, i32 0>
+  %t3 = shl <3 x i32> %shuf, <i32 17, i32 17, i32 17>
+  ret <3 x i32> %t3
+}
+
+; Make sure we don't crash on a ConstantExpr shufflevector
+define <vscale x 2 x i64> @mul_nuw_nsw_shuffle_constant_expr(<vscale x 2 x i8> %z) {
+; CHECK-LABEL: @mul_nuw_nsw_shuffle_constant_expr(
+; CHECK-NEXT:    [[XX:%.*]] = zext <vscale x 2 x i8> [[Z:%.*]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[T3:%.*]] = mul <vscale x 2 x i64> [[XX]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 3, i32 0), <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[T3]]
+;
+  %xx = zext <vscale x 2 x i8> %z to <vscale x 2 x i64>
+  %shuf = shufflevector <vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 3, i32 0), <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i32> zeroinitializer
+  %t3 = mul <vscale x 2 x i64> %shuf, %xx
+  ret <vscale x 2 x i64> %t3
+}

diff --git a/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll b/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll
new file mode 100644
index 000000000000..cb8754097445
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/obfuscated_splat-inseltpoison.ll
@@ -0,0 +1,11 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+define void @test(<4 x float> *%in_ptr, <4 x float> *%out_ptr) {
+  %A = load <4 x float>, <4 x float>* %in_ptr, align 16
+  %B = shufflevector <4 x float> %A, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+  %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
+  %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; CHECK:  %D = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> zeroinitializer
+  store <4 x float> %D, <4 x float> *%out_ptr
+  ret void
+}

diff --git a/llvm/test/Transforms/InstCombine/pr2645-0-inseltpoison.ll b/llvm/test/Transforms/InstCombine/pr2645-0-inseltpoison.ll
new file mode 100644
index 000000000000..a32a1ae8dd0b
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr2645-0-inseltpoison.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | grep "insertelement <4 x float> poison"
+
+; Instcombine should be able to prove that none of the
+; insertelement's first operand's elements are needed.
+
+define internal void @""(i8*) {
+; <label>:1
+        bitcast i8* %0 to i32*          ; <i32*>:2 [#uses=1]
+        load i32, i32* %2, align 1           ; <i32>:3 [#uses=1]
+        getelementptr i8, i8* %0, i32 4             ; <i8*>:4 [#uses=1]
+        bitcast i8* %4 to i32*          ; <i32*>:5 [#uses=1]
+        load i32, i32* %5, align 1           ; <i32>:6 [#uses=1]
+        br label %7
+
+; <label>:7             ; preds = %9, %1
+        %.01 = phi <4 x float> [ undef, %1 ], [ %12, %9 ]               ; <<4 x float>> [#uses=1]
+        %.0 = phi i32 [ %3, %1 ], [ %15, %9 ]           ; <i32> [#uses=3]
+        icmp slt i32 %.0, %6            ; <i1>:8 [#uses=1]
+        br i1 %8, label %9, label %16
+
+; <label>:9             ; preds = %7
+        sitofp i32 %.0 to float         ; <float>:10 [#uses=1]
+        insertelement <4 x float> %.01, float %10, i32 0                ; <<4 x float>>:11 [#uses=1]
+        shufflevector <4 x float> %11, <4 x float> poison, <4 x i32> zeroinitializer             ; <<4 x float>>:12 [#uses=2]
+        getelementptr i8, i8* %0, i32 48            ; <i8*>:13 [#uses=1]
+        bitcast i8* %13 to <4 x float>*         ; <<4 x float>*>:14 [#uses=1]
+        store <4 x float> %12, <4 x float>* %14, align 16
+        add i32 %.0, 2          ; <i32>:15 [#uses=1]
+        br label %7
+
+; <label>:16            ; preds = %7
+        ret void
+}

diff --git a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
index b2f815971c46..f4712fc3a908 100644
--- a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
@@ -80,7 +80,7 @@ define void @scalarize_phi(i32 * %n, float * %inout) {
 entry:
   %t0 = load volatile float, float * %inout, align 4
   %insert = insertelement <4 x float> poison, float %t0, i32 0
-  %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer
+  %splat = shufflevector <4 x float> %insert, <4 x float> poison, <4 x i32> zeroinitializer
   %insert1 = insertelement <4 x float> poison, float 3.0, i32 0
   br label %for.cond
 

diff --git a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
index a59782378b9a..11cb489500f0 100644
--- a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
@@ -157,11 +157,11 @@ define <4 x i32> @extract_cond(<4 x i32> %x, <4 x i32> %y, <4 x i1> %condv) {
 
 define <4 x i32> @splat_cond(<4 x i32> %x, <4 x i32> %y, <4 x i1> %condv) {
 ; CHECK-LABEL: @splat_cond(
-; CHECK-NEXT:    [[SPLATCOND:%.*]] = shufflevector <4 x i1> [[CONDV:%.*]], <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[SPLATCOND:%.*]] = shufflevector <4 x i1> [[CONDV:%.*]], <4 x i1> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[SPLATCOND]], <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatcond = shufflevector <4 x i1> %condv, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %splatcond = shufflevector <4 x i1> %condv, <4 x i1> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %r = select <4 x i1> %splatcond, <4 x i32> %x, <4 x i32> %y
   ret <4 x i32> %r
 }

diff --git a/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll
index 8d1f068136f1..3232cdc49c0f 100644
--- a/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll
@@ -77,13 +77,13 @@ define <4 x i32> @shl_C1_add_A_C2_v4i32_splat(i16 %I) {
 ; CHECK-LABEL: @shl_C1_add_A_C2_v4i32_splat(
 ; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[E:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 poison, i32 -458752>, [[C]]
 ; CHECK-NEXT:    ret <4 x i32> [[E]]
 ;
   %A = zext i16 %I to i32
   %B = insertelement <4 x i32> poison, i32 %A, i32 0
-  %C = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
+  %C = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> zeroinitializer
   %D = add <4 x i32> %C, <i32 0, i32 1, i32 50, i32 16>
   %E = shl <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, %D
   ret <4 x i32> %E
@@ -93,13 +93,13 @@ define <4 x i32> @ashr_C1_add_A_C2_v4i32_splat(i16 %I) {
 ; CHECK-LABEL: @ashr_C1_add_A_C2_v4i32_splat(
 ; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[E:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 poison, i32 -1>, [[C]]
 ; CHECK-NEXT:    ret <4 x i32> [[E]]
 ;
   %A = zext i16 %I to i32
   %B = insertelement <4 x i32> poison, i32 %A, i32 0
-  %C = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
+  %C = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> zeroinitializer
   %D = add <4 x i32> %C, <i32 0, i32 1, i32 50, i32 16>
   %E = ashr <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, %D
   ret <4 x i32> %E
@@ -109,13 +109,13 @@ define <4 x i32> @lshr_C1_add_A_C2_v4i32_splat(i16 %I) {
 ; CHECK-LABEL: @lshr_C1_add_A_C2_v4i32_splat(
 ; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[E:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 poison, i32 65535>, [[C]]
 ; CHECK-NEXT:    ret <4 x i32> [[E]]
 ;
   %A = zext i16 %I to i32
   %B = insertelement <4 x i32> poison, i32 %A, i32 0
-  %C = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
+  %C = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> zeroinitializer
   %D = add <4 x i32> %C, <i32 0, i32 1, i32 50, i32 16>
   %E = lshr <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, %D
   ret <4 x i32> %E

diff --git a/llvm/test/Transforms/InstCombine/shuffle-cast-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shuffle-cast-inseltpoison.ll
new file mode 100644
index 000000000000..bdb3b74b5d20
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/shuffle-cast-inseltpoison.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ANY,LE
+; RUN: opt < %s -instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ANY,BE
+
+define <4 x i16> @trunc_little_endian(<4 x i32> %x) {
+; LE-LABEL: @trunc_little_endian(
+; LE-NEXT:    [[R:%.*]] = trunc <4 x i32> [[X:%.*]] to <4 x i16>
+; LE-NEXT:    ret <4 x i16> [[R]]
+;
+; BE-LABEL: @trunc_little_endian(
+; BE-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
+; BE-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; BE-NEXT:    ret <4 x i16> [[R]]
+;
+  %b = bitcast <4 x i32> %x to <8 x i16>
+  %r = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i16> %r
+}
+
+define <4 x i16> @trunc_big_endian(<4 x i32> %x) {
+; LE-LABEL: @trunc_big_endian(
+; LE-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
+; LE-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; LE-NEXT:    ret <4 x i16> [[R]]
+;
+; BE-LABEL: @trunc_big_endian(
+; BE-NEXT:    [[R:%.*]] = trunc <4 x i32> [[X:%.*]] to <4 x i16>
+; BE-NEXT:    ret <4 x i16> [[R]]
+;
+  %b = bitcast <4 x i32> %x to <8 x i16>
+  %r = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i16> %r
+}
+
+declare void @use_v8i16(<8 x i16>)
+
+; Extra use is ok.
+
+define <2 x i16> @trunc_little_endian_extra_use(<2 x i64> %x) {
+; LE-LABEL: @trunc_little_endian_extra_use(
+; LE-NEXT:    [[B:%.*]] = bitcast <2 x i64> [[X:%.*]] to <8 x i16>
+; LE-NEXT:    call void @use_v8i16(<8 x i16> [[B]])
+; LE-NEXT:    [[R:%.*]] = trunc <2 x i64> [[X]] to <2 x i16>
+; LE-NEXT:    ret <2 x i16> [[R]]
+;
+; BE-LABEL: @trunc_little_endian_extra_use(
+; BE-NEXT:    [[B:%.*]] = bitcast <2 x i64> [[X:%.*]] to <8 x i16>
+; BE-NEXT:    call void @use_v8i16(<8 x i16> [[B]])
+; BE-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <2 x i32> <i32 0, i32 4>
+; BE-NEXT:    ret <2 x i16> [[R]]
+;
+  %b = bitcast <2 x i64> %x to <8 x i16>
+  call void @use_v8i16(<8 x i16> %b)
+  %r = shufflevector <8 x i16> %b, <8 x i16> poison, <2 x i32> <i32 0, i32 4>
+  ret <2 x i16> %r
+}
+
+declare void @use_v12i11(<12 x i11>)
+
+; Weird types are ok.
+
+define <4 x i11> @trunc_big_endian_extra_use(<4 x i33> %x) {
+; LE-LABEL: @trunc_big_endian_extra_use(
+; LE-NEXT:    [[B:%.*]] = bitcast <4 x i33> [[X:%.*]] to <12 x i11>
+; LE-NEXT:    call void @use_v12i11(<12 x i11> [[B]])
+; LE-NEXT:    [[R:%.*]] = shufflevector <12 x i11> [[B]], <12 x i11> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; LE-NEXT:    ret <4 x i11> [[R]]
+;
+; BE-LABEL: @trunc_big_endian_extra_use(
+; BE-NEXT:    [[B:%.*]] = bitcast <4 x i33> [[X:%.*]] to <12 x i11>
+; BE-NEXT:    call void @use_v12i11(<12 x i11> [[B]])
+; BE-NEXT:    [[R:%.*]] = trunc <4 x i33> [[X]] to <4 x i11>
+; BE-NEXT:    ret <4 x i11> [[R]]
+;
+  %b = bitcast <4 x i33> %x to <12 x i11>
+  call void @use_v12i11(<12 x i11> %b)
+  %r = shufflevector <12 x i11> %b, <12 x i11> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  ret <4 x i11> %r
+}
+
+define <4 x i16> @wrong_cast1(i128 %x) {
+; ANY-LABEL: @wrong_cast1(
+; ANY-NEXT:    [[B:%.*]] = bitcast i128 [[X:%.*]] to <8 x i16>
+; ANY-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; ANY-NEXT:    ret <4 x i16> [[R]]
+;
+  %b = bitcast i128 %x to <8 x i16>
+  %r = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i16> %r
+}
+
+define <4 x i16> @wrong_cast2(<4 x float> %x) {
+; ANY-LABEL: @wrong_cast2(
+; ANY-NEXT:    [[B:%.*]] = bitcast <4 x float> [[X:%.*]] to <8 x i16>
+; ANY-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; ANY-NEXT:    ret <4 x i16> [[R]]
+;
+  %b = bitcast <4 x float> %x to <8 x i16>
+  %r = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i16> %r
+}
+
+define <4 x half> @wrong_cast3(<4 x i32> %x) {
+; ANY-LABEL: @wrong_cast3(
+; ANY-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x half>
+; ANY-NEXT:    [[R:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; ANY-NEXT:    ret <4 x half> [[R]]
+;
+  %b = bitcast <4 x i32> %x to <8 x half>
+  %r = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x half> %r
+}
+
+define <2 x i16> @wrong_shuffle(<4 x i32> %x) {
+; ANY-LABEL: @wrong_shuffle(
+; ANY-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
+; ANY-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <2 x i32> <i32 0, i32 2>
+; ANY-NEXT:    ret <2 x i16> [[R]]
+;
+  %b = bitcast <4 x i32> %x to <8 x i16>
+  %r = shufflevector <8 x i16> %b, <8 x i16> poison, <2 x i32> <i32 0, i32 2>
+  ret <2 x i16> %r
+}

diff --git a/llvm/test/Transforms/InstCombine/shuffle-select-narrow-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shuffle-select-narrow-inseltpoison.ll
new file mode 100644
index 000000000000..f986d10e11a0
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/shuffle-select-narrow-inseltpoison.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Narrow the select operands to eliminate the existing shuffles and replace a wide select with a narrow select.
+
+define <2 x i8> @narrow_shuffle_of_select(<2 x i1> %cmp, <4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @narrow_shuffle_of_select(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i8> [[TMP1]], <2 x i8> [[TMP2]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %widecmp = shufflevector <2 x i1> %cmp, <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widesel = select <4 x i1> %widecmp, <4 x i8> %x, <4 x i8> %y
+  %r = shufflevector <4 x i8> %widesel, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x i8> %r
+}
+
+; The 1st shuffle is not extending with undefs, but demanded-elements analysis corrects that.
+
+define <2 x i8> @narrow_shuffle_of_select_overspecified_extend(<2 x i1> %cmp, <4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @narrow_shuffle_of_select_overspecified_extend(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i8> [[TMP1]], <2 x i8> [[TMP2]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %widecmp = shufflevector <2 x i1> %cmp, <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %widesel = select <4 x i1> %widecmp, <4 x i8> %x, <4 x i8> %y
+  %r = shufflevector <4 x i8> %widesel, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x i8> %r
+}
+
+; Verify that undef elements are acceptable for an identity shuffle mask. Also check FP types.
+
+define <3 x float> @narrow_shuffle_of_select_undefs(<3 x i1> %cmp, <4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @narrow_shuffle_of_select_undefs(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = select <3 x i1> [[CMP:%.*]], <3 x float> [[TMP1]], <3 x float> [[TMP2]]
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %widecmp = shufflevector <3 x i1> %cmp, <3 x i1> poison, <4 x i32> <i32 undef, i32 1, i32 2, i32 undef>
+  %widesel = select <4 x i1> %widecmp, <4 x float> %x, <4 x float> %y
+  %r = shufflevector <4 x float> %widesel, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
+  ret <3 x float> %r
+}
+
+declare void @use(<4 x i8>)
+declare void @use_cmp(<4 x i1>)
+
+; Negative test - extra use would require more instructions than we started with.
+
+define <2 x i8> @narrow_shuffle_of_select_use1(<2 x i1> %cmp, <4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @narrow_shuffle_of_select_use1(
+; CHECK-NEXT:    [[WIDECMP:%.*]] = shufflevector <2 x i1> [[CMP:%.*]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[WIDESEL:%.*]] = select <4 x i1> [[WIDECMP]], <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]]
+; CHECK-NEXT:    call void @use(<4 x i8> [[WIDESEL]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[WIDESEL]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %widecmp = shufflevector <2 x i1> %cmp, <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widesel = select <4 x i1> %widecmp, <4 x i8> %x, <4 x i8> %y
+  call void @use(<4 x i8> %widesel)
+  %r = shufflevector <4 x i8> %widesel, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x i8> %r
+}
+
+; Negative test - extra use would require more instructions than we started with.
+
+define <2 x i8> @narrow_shuffle_of_select_use2(<2 x i1> %cmp, <4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @narrow_shuffle_of_select_use2(
+; CHECK-NEXT:    [[WIDECMP:%.*]] = shufflevector <2 x i1> [[CMP:%.*]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    call void @use_cmp(<4 x i1> [[WIDECMP]])
+; CHECK-NEXT:    [[WIDESEL:%.*]] = select <4 x i1> [[WIDECMP]], <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[WIDESEL]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %widecmp = shufflevector <2 x i1> %cmp, <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  call void @use_cmp(<4 x i1> %widecmp)
+  %widesel = select <4 x i1> %widecmp, <4 x i8> %x, <4 x i8> %y
+  %r = shufflevector <4 x i8> %widesel, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x i8> %r
+}
+
+; Negative test - mismatched types would require extra shuffling.
+
+define <3 x i8> @narrow_shuffle_of_select_mismatch_types1(<2 x i1> %cmp, <4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @narrow_shuffle_of_select_mismatch_types1(
+; CHECK-NEXT:    [[WIDECMP:%.*]] = shufflevector <2 x i1> [[CMP:%.*]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[WIDESEL:%.*]] = select <4 x i1> [[WIDECMP]], <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[WIDESEL]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %widecmp = shufflevector <2 x i1> %cmp, <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widesel = select <4 x i1> %widecmp, <4 x i8> %x, <4 x i8> %y
+  %r = shufflevector <4 x i8> %widesel, <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x i8> %r
+}
+
+; Negative test - mismatched types would require extra shuffling.
+
+define <3 x i8> @narrow_shuffle_of_select_mismatch_types2(<4 x i1> %cmp, <6 x i8> %x, <6 x i8> %y) {
+; CHECK-LABEL: @narrow_shuffle_of_select_mismatch_types2(
+; CHECK-NEXT:    [[WIDECMP:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[WIDESEL:%.*]] = select <6 x i1> [[WIDECMP]], <6 x i8> [[X:%.*]], <6 x i8> [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <6 x i8> [[WIDESEL]], <6 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %widecmp = shufflevector <4 x i1> %cmp, <4 x i1> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef>
+  %widesel = select <6 x i1> %widecmp, <6 x i8> %x, <6 x i8> %y
+  %r = shufflevector <6 x i8> %widesel, <6 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x i8> %r
+}
+
+; Narrowing constants does not require creating new narrowing shuffle instructions.
+
+define <2 x i8> @narrow_shuffle_of_select_consts(<2 x i1> %cmp) {
+; CHECK-LABEL: @narrow_shuffle_of_select_consts(
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i8> <i8 -1, i8 -2>, <2 x i8> <i8 1, i8 2>
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %widecmp = shufflevector <2 x i1> %cmp, <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widesel = select <4 x i1> %widecmp, <4 x i8> <i8 -1, i8 -2, i8 -3, i8 -4>, <4 x i8> <i8 1, i8 2, i8 3, i8 4>
+  %r = shufflevector <4 x i8> %widesel, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x i8> %r
+}
+
+; PR38691 - https://bugs.llvm.org/show_bug.cgi?id=38691
+; If the operands are widened only to be narrowed back, then all of the shuffles are unnecessary.
+
+define <2 x i8> @narrow_shuffle_of_select_with_widened_ops(<2 x i1> %cmp, <2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @narrow_shuffle_of_select_with_widened_ops(
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[CMP:%.*]], <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %widex = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widey = shufflevector <2 x i8> %y, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widecmp = shufflevector <2 x i1> %cmp, <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widesel = select <4 x i1> %widecmp, <4 x i8> %widex, <4 x i8> %widey
+  %r = shufflevector <4 x i8> %widesel, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x i8> %r
+}
+

diff --git a/llvm/test/Transforms/InstCombine/shuffle_select-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shuffle_select-inseltpoison.ll
new file mode 100644
index 000000000000..bca56a88dde8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/shuffle_select-inseltpoison.ll
@@ -0,0 +1,1467 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Try to eliminate binops and shuffles when the shuffle is a select in disguise:
+; PR37806 - https://bugs.llvm.org/show_bug.cgi?id=37806
+
+define <4 x i32> @add(<4 x i32> %v) {
+; CHECK-LABEL: @add(
+; CHECK-NEXT:    [[S:%.*]] = add <4 x i32> [[V:%.*]], <i32 11, i32 0, i32 13, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = add <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+; Propagate flags when possible.
+
+define <4 x i32> @add_nuw_nsw(<4 x i32> %v) {
+; CHECK-LABEL: @add_nuw_nsw(
+; CHECK-NEXT:    [[S:%.*]] = add nuw nsw <4 x i32> [[V:%.*]], <i32 11, i32 0, i32 13, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = add nuw nsw <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @add_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @add_undef_mask_elt(
+; CHECK-NEXT:    [[S:%.*]] = add <4 x i32> [[V:%.*]], <i32 11, i32 0, i32 undef, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = add <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 5, i32 undef, i32 7>
+  ret <4 x i32> %s
+}
+
+; Poison flags must be dropped or undef must be replaced with safe constant.
+
+define <4 x i32> @add_nuw_nsw_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @add_nuw_nsw_undef_mask_elt(
+; CHECK-NEXT:    [[S:%.*]] = add <4 x i32> [[V:%.*]], <i32 11, i32 undef, i32 13, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = add nuw nsw <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+; Constant operand 0 (LHS) could work for some non-commutative binops?
+
+define <4 x i32> @sub(<4 x i32> %v) {
+; CHECK-LABEL: @sub(
+; CHECK-NEXT:    [[B:%.*]] = sub <4 x i32> <i32 poison, i32 poison, i32 poison, i32 14>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[V]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = sub <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %v
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+; If any element of the shuffle mask operand is undef, that element of the result is undef.
+; The shuffle is eliminated in this transform, but we can replace a constant element with undef.
+; Preserve flags when possible. It's not safe to propagate poison-generating flags with undef constants.
+
+define <4 x i32> @mul(<4 x i32> %v) {
+; CHECK-LABEL: @mul(
+; CHECK-NEXT:    [[S:%.*]] = mul <4 x i32> [[V:%.*]], <i32 undef, i32 12, i32 1, i32 14>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = mul nsw nuw <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @shl(<4 x i32> %v) {
+; CHECK-LABEL: @shl(
+; CHECK-NEXT:    [[S:%.*]] = shl <4 x i32> [[V:%.*]], <i32 0, i32 12, i32 13, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = shl <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @shl_nsw(<4 x i32> %v) {
+; CHECK-LABEL: @shl_nsw(
+; CHECK-NEXT:    [[S:%.*]] = shl nsw <4 x i32> [[V:%.*]], <i32 0, i32 12, i32 13, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = shl nsw <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @shl_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @shl_undef_mask_elt(
+; CHECK-NEXT:    [[S:%.*]] = shl <4 x i32> [[V:%.*]], <i32 0, i32 12, i32 13, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = shl <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @shl_nuw_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @shl_nuw_undef_mask_elt(
+; CHECK-NEXT:    [[S:%.*]] = shl nuw <4 x i32> [[V:%.*]], <i32 0, i32 0, i32 13, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = shl nuw <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 undef, i32 5, i32 2, i32 undef>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @lshr_constant_op0(<4 x i32> %v) {
+; CHECK-LABEL: @lshr_constant_op0(
+; CHECK-NEXT:    [[S:%.*]] = lshr <4 x i32> [[V:%.*]], <i32 11, i32 12, i32 0, i32 14>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = lshr <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @lshr_exact_constant_op0(<4 x i32> %v) {
+; CHECK-LABEL: @lshr_exact_constant_op0(
+; CHECK-NEXT:    [[S:%.*]] = lshr exact <4 x i32> [[V:%.*]], <i32 11, i32 12, i32 0, i32 14>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = lshr exact <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @lshr_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @lshr_undef_mask_elt(
+; CHECK-NEXT:    [[S:%.*]] = shl <4 x i32> [[V:%.*]], <i32 0, i32 12, i32 13, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = shl <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @lshr_exact_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @lshr_exact_undef_mask_elt(
+; CHECK-NEXT:    [[S:%.*]] = lshr exact <4 x i32> [[V:%.*]], <i32 0, i32 0, i32 13, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = lshr exact <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 undef, i32 5, i32 2, i32 undef>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @lshr_constant_op1(<4 x i32> %v) {
+; CHECK-LABEL: @lshr_constant_op1(
+; CHECK-NEXT:    [[B:%.*]] = lshr exact <4 x i32> <i32 11, i32 12, i32 13, i32 14>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[V]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = lshr exact <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %v
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+; Try weird types.
+
+define <3 x i32> @ashr(<3 x i32> %v) {
+; CHECK-LABEL: @ashr(
+; CHECK-NEXT:    [[S:%.*]] = ashr <3 x i32> [[V:%.*]], <i32 0, i32 12, i32 13>
+; CHECK-NEXT:    ret <3 x i32> [[S]]
+;
+  %b = ashr <3 x i32> %v, <i32 11, i32 12, i32 13>
+  %s = shufflevector <3 x i32> %b, <3 x i32> %v, <3 x i32> <i32 3, i32 1, i32 2>
+  ret <3 x i32> %s
+}
+
+define <3 x i42> @and(<3 x i42> %v) {
+; CHECK-LABEL: @and(
+; CHECK-NEXT:    [[S:%.*]] = and <3 x i42> [[V:%.*]], <i42 -1, i42 12, i42 undef>
+; CHECK-NEXT:    ret <3 x i42> [[S]]
+;
+  %b = and <3 x i42> %v, <i42 11, i42 12, i42 13>
+  %s = shufflevector <3 x i42> %v, <3 x i42> %b, <3 x i32> <i32 0, i32 4, i32 undef>
+  ret <3 x i42> %s
+}
+
+; It doesn't matter if the intermediate op has extra uses.
+
+declare void @use_v4i32(<4 x i32>)
+
+define <4 x i32> @or(<4 x i32> %v) {
+; CHECK-LABEL: @or(
+; CHECK-NEXT:    [[B:%.*]] = or <4 x i32> [[V:%.*]], <i32 11, i32 12, i32 13, i32 14>
+; CHECK-NEXT:    [[S:%.*]] = or <4 x i32> [[V]], <i32 0, i32 0, i32 13, i32 14>
+; CHECK-NEXT:    call void @use_v4i32(<4 x i32> [[B]])
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = or <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  call void @use_v4i32(<4 x i32> %b)
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @xor(<4 x i32> %v) {
+; CHECK-LABEL: @xor(
+; CHECK-NEXT:    [[S:%.*]] = xor <4 x i32> [[V:%.*]], <i32 0, i32 12, i32 0, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = xor <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @udiv(<4 x i32> %v) {
+; CHECK-LABEL: @udiv(
+; CHECK-NEXT:    [[B:%.*]] = udiv <4 x i32> <i32 11, i32 12, i32 13, i32 14>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[V]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = udiv <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %v
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @udiv_exact(<4 x i32> %v) {
+; CHECK-LABEL: @udiv_exact(
+; CHECK-NEXT:    [[B:%.*]] = udiv exact <4 x i32> <i32 11, i32 12, i32 13, i32 14>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[V]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = udiv exact <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %v
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @udiv_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @udiv_undef_mask_elt(
+; CHECK-NEXT:    [[B:%.*]] = udiv <4 x i32> <i32 11, i32 12, i32 13, i32 14>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[V]], <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = udiv <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %v
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @udiv_exact_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @udiv_exact_undef_mask_elt(
+; CHECK-NEXT:    [[B:%.*]] = udiv exact <4 x i32> <i32 11, i32 12, i32 13, i32 14>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[V]], <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = udiv exact <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %v
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @sdiv(<4 x i32> %v) {
+; CHECK-LABEL: @sdiv(
+; CHECK-NEXT:    [[S:%.*]] = sdiv <4 x i32> [[V:%.*]], <i32 11, i32 1, i32 13, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = sdiv <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @sdiv_exact(<4 x i32> %v) {
+; CHECK-LABEL: @sdiv_exact(
+; CHECK-NEXT:    [[S:%.*]] = sdiv exact <4 x i32> [[V:%.*]], <i32 11, i32 1, i32 13, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = sdiv exact <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  ret <4 x i32> %s
+}
+
+; Div/rem need special handling if the shuffle has undef elements.
+
+define <4 x i32> @sdiv_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @sdiv_undef_mask_elt(
+; CHECK-NEXT:    [[S:%.*]] = sdiv <4 x i32> [[V:%.*]], <i32 1, i32 1, i32 13, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = sdiv <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 undef, i32 1, i32 6, i32 undef>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @sdiv_exact_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @sdiv_exact_undef_mask_elt(
+; CHECK-NEXT:    [[S:%.*]] = sdiv exact <4 x i32> [[V:%.*]], <i32 1, i32 1, i32 13, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = sdiv exact <4 x i32> %v, <i32 11, i32 12, i32 13, i32 14>
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 undef, i32 1, i32 6, i32 undef>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @urem(<4 x i32> %v) {
+; CHECK-LABEL: @urem(
+; CHECK-NEXT:    [[B:%.*]] = urem <4 x i32> <i32 11, i32 12, i32 13, i32 14>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[V]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = urem <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %v
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @urem_undef_mask_elt(<4 x i32> %v) {
+; CHECK-LABEL: @urem_undef_mask_elt(
+; CHECK-NEXT:    [[B:%.*]] = urem <4 x i32> <i32 11, i32 12, i32 13, i32 14>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[V]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = urem <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %v
+  %s = shufflevector <4 x i32> %b, <4 x i32> %v, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @srem(<4 x i32> %v) {
+; CHECK-LABEL: @srem(
+; CHECK-NEXT:    [[B:%.*]] = srem <4 x i32> <i32 11, i32 12, i32 13, i32 14>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[V]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %b = srem <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %v
+  %s = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i32> %s
+}
+
+; Try FP ops/types.
+
+define <4 x float> @fadd(<4 x float> %v) {
+; CHECK-LABEL: @fadd(
+; CHECK-NEXT:    [[S:%.*]] = fadd <4 x float> [[V:%.*]], <float 4.100000e+01, float 4.200000e+01, float -0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %b = fadd <4 x float> %v, <float 41.0, float 42.0, float 43.0, float 44.0>
+  %s = shufflevector <4 x float> %b, <4 x float> %v, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %s
+}
+
+define <4 x double> @fsub(<4 x double> %v) {
+; CHECK-LABEL: @fsub(
+; CHECK-NEXT:    [[B:%.*]] = fsub <4 x double> <double poison, double poison, double 4.300000e+01, double 4.400000e+01>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x double> [[V]], <4 x double> [[B]], <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[S]]
+;
+  %b = fsub <4 x double> <double 41.0, double 42.0, double 43.0, double 44.0>, %v
+  %s = shufflevector <4 x double> %v, <4 x double> %b, <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+  ret <4 x double> %s
+}
+
+; Propagate any FMF.
+
+define <4 x float> @fmul(<4 x float> %v) {
+; CHECK-LABEL: @fmul(
+; CHECK-NEXT:    [[S:%.*]] = fmul nnan ninf <4 x float> [[V:%.*]], <float 4.100000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %b = fmul nnan ninf <4 x float> %v, <float 41.0, float 42.0, float 43.0, float 44.0>
+  %s = shufflevector <4 x float> %b, <4 x float> %v, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %s
+}
+
+define <4 x double> @fdiv_constant_op0(<4 x double> %v) {
+; CHECK-LABEL: @fdiv_constant_op0(
+; CHECK-NEXT:    [[B:%.*]] = fdiv fast <4 x double> <double poison, double poison, double 4.300000e+01, double 4.400000e+01>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x double> [[V]], <4 x double> [[B]], <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[S]]
+;
+  %b = fdiv fast <4 x double> <double 41.0, double 42.0, double 43.0, double 44.0>, %v
+  %s = shufflevector <4 x double> %v, <4 x double> %b, <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+  ret <4 x double> %s
+}
+
+define <4 x double> @fdiv_constant_op1(<4 x double> %v) {
+; CHECK-LABEL: @fdiv_constant_op1(
+; CHECK-NEXT:    [[S:%.*]] = fdiv reassoc <4 x double> [[V:%.*]], <double undef, double 1.000000e+00, double 4.300000e+01, double 4.400000e+01>
+; CHECK-NEXT:    ret <4 x double> [[S]]
+;
+  %b = fdiv reassoc <4 x double> %v, <double 41.0, double 42.0, double 43.0, double 44.0>
+  %s = shufflevector <4 x double> %v, <4 x double> %b, <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+  ret <4 x double> %s
+}
+
+define <4 x double> @frem(<4 x double> %v) {
+; CHECK-LABEL: @frem(
+; CHECK-NEXT:    [[B:%.*]] = frem <4 x double> <double 4.100000e+01, double 4.200000e+01, double poison, double poison>, [[V:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[V]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[S]]
+;
+  %b = frem <4 x double> <double 41.0, double 42.0, double 43.0, double 44.0>, %v
+  %s = shufflevector <4 x double> %b, <4 x double> %v, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %s
+}
+
+; Tests where both operands of the shuffle are binops with the same opcode.
+
+define <4 x i32> @add_add(<4 x i32> %v0) {
+; CHECK-LABEL: @add_add(
+; CHECK-NEXT:    [[T3:%.*]] = add <4 x i32> [[V0:%.*]], <i32 1, i32 6, i32 3, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = add <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = add <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @add_add_nsw(<4 x i32> %v0) {
+; CHECK-LABEL: @add_add_nsw(
+; CHECK-NEXT:    [[T3:%.*]] = add nsw <4 x i32> [[V0:%.*]], <i32 1, i32 6, i32 3, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = add nsw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = add nsw <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @add_add_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @add_add_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = add <4 x i32> [[V0:%.*]], <i32 1, i32 6, i32 undef, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = add <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = add <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 undef, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Poison flags must be dropped or undef must be replaced with safe constant.
+
+define <4 x i32> @add_add_nsw_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @add_add_nsw_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = add <4 x i32> [[V0:%.*]], <i32 1, i32 6, i32 undef, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = add nsw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = add nsw <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 undef, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Constant operand 0 (LHS) also works.
+
+define <4 x i32> @sub_sub(<4 x i32> %v0) {
+; CHECK-LABEL: @sub_sub(
+; CHECK-NEXT:    [[T3:%.*]] = sub <4 x i32> <i32 1, i32 2, i32 3, i32 8>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sub <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = sub <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @sub_sub_nuw(<4 x i32> %v0) {
+; CHECK-LABEL: @sub_sub_nuw(
+; CHECK-NEXT:    [[T3:%.*]] = sub nuw <4 x i32> <i32 1, i32 2, i32 3, i32 8>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sub nuw <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = sub nuw <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @sub_sub_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @sub_sub_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = sub <4 x i32> <i32 undef, i32 2, i32 3, i32 8>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sub <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = sub <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Poison flags must be dropped or undef must be replaced with safe constant.
+
+define <4 x i32> @sub_sub_nuw_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @sub_sub_nuw_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = sub <4 x i32> <i32 undef, i32 2, i32 3, i32 8>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sub nuw <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = sub nuw <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; If any element of the shuffle mask operand is undef, that element of the result is undef.
+; The shuffle is eliminated in this transform, but we can replace a constant element with undef.
+
+define <4 x i32> @mul_mul(<4 x i32> %v0) {
+; CHECK-LABEL: @mul_mul(
+; CHECK-NEXT:    [[T3:%.*]] = mul <4 x i32> [[V0:%.*]], <i32 undef, i32 6, i32 3, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = mul <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = mul <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Preserve flags when possible.
+
+define <4 x i32> @shl_shl(<4 x i32> %v0) {
+; CHECK-LABEL: @shl_shl(
+; CHECK-NEXT:    [[T3:%.*]] = shl <4 x i32> [[V0:%.*]], <i32 5, i32 6, i32 3, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @shl_shl_nuw(<4 x i32> %v0) {
+; CHECK-LABEL: @shl_shl_nuw(
+; CHECK-NEXT:    [[T3:%.*]] = shl nuw <4 x i32> [[V0:%.*]], <i32 5, i32 6, i32 3, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl nuw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl nuw <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Shift by undef is poison. Undef must be replaced by safe constant.
+
+define <4 x i32> @shl_shl_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @shl_shl_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = shl <4 x i32> [[V0:%.*]], <i32 0, i32 6, i32 3, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 5, i32 2, i32 undef>
+  ret <4 x i32> %t3
+}
+
+; Shift by undef is poison. Undef must be replaced by safe constant.
+
+define <4 x i32> @shl_shl_nuw_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @shl_shl_nuw_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = shl nuw <4 x i32> [[V0:%.*]], <i32 0, i32 6, i32 3, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl nuw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl nuw <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 5, i32 2, i32 undef>
+  ret <4 x i32> %t3
+}
+
+; Can't propagate the flag here.
+
+define <4 x i32> @lshr_lshr(<4 x i32> %v0) {
+; CHECK-LABEL: @lshr_lshr(
+; CHECK-NEXT:    [[T3:%.*]] = lshr <4 x i32> <i32 5, i32 6, i32 3, i32 8>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = lshr exact <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = lshr <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Try weird types.
+
+define <3 x i32> @ashr_ashr(<3 x i32> %v0) {
+; CHECK-LABEL: @ashr_ashr(
+; CHECK-NEXT:    [[T3:%.*]] = ashr <3 x i32> [[V0:%.*]], <i32 4, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x i32> [[T3]]
+;
+  %t1 = ashr <3 x i32> %v0, <i32 1, i32 2, i32 3>
+  %t2 = ashr <3 x i32> %v0, <i32 4, i32 5, i32 6>
+  %t3 = shufflevector <3 x i32> %t1, <3 x i32> %t2, <3 x i32> <i32 3, i32 1, i32 2>
+  ret <3 x i32> %t3
+}
+
+define <3 x i42> @and_and(<3 x i42> %v0) {
+; CHECK-LABEL: @and_and(
+; CHECK-NEXT:    [[T3:%.*]] = and <3 x i42> [[V0:%.*]], <i42 1, i42 5, i42 undef>
+; CHECK-NEXT:    ret <3 x i42> [[T3]]
+;
+  %t1 = and <3 x i42> %v0, <i42 1, i42 2, i42 3>
+  %t2 = and <3 x i42> %v0, <i42 4, i42 5, i42 6>
+  %t3 = shufflevector <3 x i42> %t1, <3 x i42> %t2, <3 x i32> <i32 0, i32 4, i32 undef>
+  ret <3 x i42> %t3
+}
+
+; It doesn't matter if the intermediate ops have extra uses.
+
+define <4 x i32> @or_or(<4 x i32> %v0) {
+; CHECK-LABEL: @or_or(
+; CHECK-NEXT:    [[T1:%.*]] = or <4 x i32> [[V0:%.*]], <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[T3:%.*]] = or <4 x i32> [[V0]], <i32 5, i32 6, i32 3, i32 4>
+; CHECK-NEXT:    call void @use_v4i32(<4 x i32> [[T1]])
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = or <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = or <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  call void @use_v4i32(<4 x i32> %t1)
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @xor_xor(<4 x i32> %v0) {
+; CHECK-LABEL: @xor_xor(
+; CHECK-NEXT:    [[T2:%.*]] = xor <4 x i32> [[V0:%.*]], <i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[T3:%.*]] = xor <4 x i32> [[V0]], <i32 1, i32 6, i32 3, i32 4>
+; CHECK-NEXT:    call void @use_v4i32(<4 x i32> [[T2]])
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = xor <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = xor <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  call void @use_v4i32(<4 x i32> %t2)
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @udiv_udiv(<4 x i32> %v0) {
+; CHECK-LABEL: @udiv_udiv(
+; CHECK-NEXT:    [[T1:%.*]] = udiv <4 x i32> <i32 1, i32 2, i32 3, i32 4>, [[V0:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = udiv <4 x i32> <i32 5, i32 6, i32 7, i32 8>, [[V0]]
+; CHECK-NEXT:    [[T3:%.*]] = udiv <4 x i32> <i32 1, i32 2, i32 3, i32 8>, [[V0]]
+; CHECK-NEXT:    call void @use_v4i32(<4 x i32> [[T1]])
+; CHECK-NEXT:    call void @use_v4i32(<4 x i32> [[T2]])
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = udiv <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = udiv <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  call void @use_v4i32(<4 x i32> %t1)
+  call void @use_v4i32(<4 x i32> %t2)
+  ret <4 x i32> %t3
+}
+
+; Div/rem need special handling if the shuffle has undef elements.
+
+define <4 x i32> @sdiv_sdiv(<4 x i32> %v0) {
+; CHECK-LABEL: @sdiv_sdiv(
+; CHECK-NEXT:    [[T3:%.*]] = sdiv <4 x i32> [[V0:%.*]], <i32 1, i32 2, i32 7, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sdiv <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = sdiv <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @sdiv_sdiv_exact(<4 x i32> %v0) {
+; CHECK-LABEL: @sdiv_sdiv_exact(
+; CHECK-NEXT:    [[T3:%.*]] = sdiv exact <4 x i32> [[V0:%.*]], <i32 1, i32 2, i32 7, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sdiv exact <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = sdiv exact <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @sdiv_sdiv_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @sdiv_sdiv_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = sdiv <4 x i32> [[V0:%.*]], <i32 1, i32 2, i32 7, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sdiv <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = sdiv <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 1, i32 6, i32 undef>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @sdiv_sdiv_exact_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @sdiv_sdiv_exact_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = sdiv exact <4 x i32> [[V0:%.*]], <i32 1, i32 2, i32 7, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sdiv exact <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = sdiv exact <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 1, i32 6, i32 undef>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @urem_urem(<4 x i32> %v0) {
+; CHECK-LABEL: @urem_urem(
+; CHECK-NEXT:    [[T3:%.*]] = urem <4 x i32> <i32 1, i32 2, i32 7, i32 8>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = urem <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = urem <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %t3
+}
+
+; This is folded by using a safe constant.
+
+define <4 x i32> @urem_urem_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @urem_urem_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = urem <4 x i32> <i32 1, i32 2, i32 7, i32 0>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = urem <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = urem <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @srem_srem(<4 x i32> %v0) {
+; CHECK-LABEL: @srem_srem(
+; CHECK-NEXT:    [[T3:%.*]] = srem <4 x i32> <i32 1, i32 2, i32 7, i32 4>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = srem <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = srem <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i32> %t3
+}
+
+; This is folded by using a safe constant.
+
+define <4 x i32> @srem_srem_undef_mask_elt(<4 x i32> %v0) {
+; CHECK-LABEL: @srem_srem_undef_mask_elt(
+; CHECK-NEXT:    [[T3:%.*]] = srem <4 x i32> <i32 1, i32 0, i32 7, i32 4>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = srem <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = srem <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v0
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 undef, i32 6, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Try FP ops/types.
+
+define <4 x float> @fadd_fadd(<4 x float> %v0) {
+; CHECK-LABEL: @fadd_fadd(
+; CHECK-NEXT:    [[T3:%.*]] = fadd <4 x float> [[V0:%.*]], <float 1.000000e+00, float 2.000000e+00, float 7.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:    ret <4 x float> [[T3]]
+;
+  %t1 = fadd <4 x float> %v0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %t2 = fadd <4 x float> %v0, <float 5.0, float 6.0, float 7.0, float 8.0>
+  %t3 = shufflevector <4 x float> %t1, <4 x float> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %t3
+}
+
+define <4 x double> @fsub_fsub(<4 x double> %v0) {
+; CHECK-LABEL: @fsub_fsub(
+; CHECK-NEXT:    [[T3:%.*]] = fsub <4 x double> <double undef, double 2.000000e+00, double 7.000000e+00, double 8.000000e+00>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[T3]]
+;
+  %t1 = fsub <4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, %v0
+  %t2 = fsub <4 x double> <double 5.0, double 6.0, double 7.0, double 8.0>, %v0
+  %t3 = shufflevector <4 x double> %t1, <4 x double> %t2, <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+  ret <4 x double> %t3
+}
+
+; Intersect any FMF.
+
+define <4 x float> @fmul_fmul(<4 x float> %v0) {
+; CHECK-LABEL: @fmul_fmul(
+; CHECK-NEXT:    [[T3:%.*]] = fmul nnan ninf <4 x float> [[V0:%.*]], <float 1.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:    ret <4 x float> [[T3]]
+;
+  %t1 = fmul nnan ninf <4 x float> %v0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %t2 = fmul nnan ninf <4 x float> %v0, <float 5.0, float 6.0, float 7.0, float 8.0>
+  %t3 = shufflevector <4 x float> %t1, <4 x float> %t2, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %t3
+}
+
+define <4 x double> @fdiv_fdiv(<4 x double> %v0) {
+; CHECK-LABEL: @fdiv_fdiv(
+; CHECK-NEXT:    [[T3:%.*]] = fdiv nnan arcp <4 x double> <double undef, double 2.000000e+00, double 7.000000e+00, double 8.000000e+00>, [[V0:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[T3]]
+;
+  %t1 = fdiv fast <4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, %v0
+  %t2 = fdiv nnan arcp <4 x double> <double 5.0, double 6.0, double 7.0, double 8.0>, %v0
+  %t3 = shufflevector <4 x double> %t1, <4 x double> %t2, <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+  ret <4 x double> %t3
+}
+
+; The variable operand must be either the first operand or second operand in both binops.
+
+define <4 x double> @frem_frem(<4 x double> %v0) {
+; CHECK-LABEL: @frem_frem(
+; CHECK-NEXT:    [[T1:%.*]] = frem <4 x double> <double 1.000000e+00, double 2.000000e+00, double poison, double poison>, [[V0:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = frem <4 x double> [[V0]], <double poison, double poison, double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x double> [[T1]], <4 x double> [[T2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[T3]]
+;
+  %t1 = frem <4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, %v0
+  %t2 = frem <4 x double> %v0, <double 5.0, double 6.0, double 7.0, double 8.0>
+  %t3 = shufflevector <4 x double> %t1, <4 x double> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %t3
+}
+
+define <4 x i32> @add_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @add_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = add <4 x i32> [[TMP1]], <i32 1, i32 6, i32 3, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = add <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = add <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Constant operand 0 (LHS) also works.
+
+define <4 x i32> @sub_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @sub_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = sub <4 x i32> <i32 1, i32 2, i32 3, i32 8>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sub <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = sub <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @sub_2_vars_nsw(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @sub_2_vars_nsw(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = sub nsw <4 x i32> <i32 1, i32 2, i32 3, i32 8>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sub nsw <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = sub nsw <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @sub_2_vars_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @sub_2_vars_undef_mask_elt(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = sub <4 x i32> <i32 undef, i32 2, i32 3, i32 8>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sub <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = sub <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Poison flags must be dropped or undef must be replaced with safe constant.
+
+define <4 x i32> @sub_2_vars_nsw_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @sub_2_vars_nsw_undef_mask_elt(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = sub <4 x i32> <i32 undef, i32 2, i32 3, i32 8>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sub nsw <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = sub nsw <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; If any element of the shuffle mask operand is undef, that element of the result is undef.
+; The shuffle is eliminated in this transform, but we can replace a constant element with undef.
+
+define <4 x i32> @mul_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @mul_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = mul <4 x i32> [[TMP1]], <i32 1, i32 6, i32 3, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = mul <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = mul <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @mul_2_vars_nuw(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @mul_2_vars_nuw(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = mul nuw <4 x i32> [[TMP1]], <i32 1, i32 6, i32 3, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = mul nuw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = mul nuw <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @mul_2_vars_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @mul_2_vars_undef_mask_elt(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = mul <4 x i32> [[TMP1]], <i32 1, i32 undef, i32 3, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = mul <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = mul <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Poison flags must be dropped or undef must be replaced with safe constant.
+
+define <4 x i32> @mul_2_vars_nuw_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @mul_2_vars_nuw_undef_mask_elt(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = mul <4 x i32> [[TMP1]], <i32 1, i32 undef, i32 3, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = mul nuw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = mul nuw <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Preserve flags when possible.
+
+define <4 x i32> @shl_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @shl_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 6, i32 3, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @shl_2_vars_nsw(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @shl_2_vars_nsw(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = shl nsw <4 x i32> [[TMP1]], <i32 1, i32 6, i32 3, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl nsw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl nsw <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Shift by undef is poison. Undef is replaced by safe constant.
+
+define <4 x i32> @shl_2_vars_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @shl_2_vars_undef_mask_elt(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 undef, i32 5, i32 2, i32 undef>
+; CHECK-NEXT:    [[T3:%.*]] = shl <4 x i32> [[TMP1]], <i32 0, i32 6, i32 3, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 5, i32 2, i32 undef>
+  ret <4 x i32> %t3
+}
+
+; Shift by undef is poison. Undef is replaced by safe constant.
+
+define <4 x i32> @shl_2_vars_nsw_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @shl_2_vars_nsw_undef_mask_elt(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 undef, i32 5, i32 2, i32 undef>
+; CHECK-NEXT:    [[T3:%.*]] = shl nsw <4 x i32> [[TMP1]], <i32 0, i32 6, i32 3, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl nsw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl nsw <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 5, i32 2, i32 undef>
+  ret <4 x i32> %t3
+}
+
+; Can't propagate the flag here.
+
+define <4 x i32> @lshr_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @lshr_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = lshr <4 x i32> <i32 5, i32 6, i32 3, i32 8>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = lshr <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = lshr exact <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @lshr_2_vars_exact(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @lshr_2_vars_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = lshr exact <4 x i32> <i32 5, i32 6, i32 3, i32 8>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = lshr exact <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = lshr exact <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; TODO: This would require a new shuffle mask (replace undef with op0 or op1 lane). Otherwise, we have shift-by-undef.
+
+define <4 x i32> @lshr_2_vars_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @lshr_2_vars_undef_mask_elt(
+; CHECK-NEXT:    [[T1:%.*]] = lshr <4 x i32> <i32 1, i32 2, i32 3, i32 4>, [[V0:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = lshr <4 x i32> <i32 5, i32 6, i32 7, i32 8>, [[V1:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = lshr <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = lshr <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; TODO: This would require a new shuffle mask (replace undef with op0 or op1 lane). Otherwise, we have shift-by-undef.
+
+define <4 x i32> @lshr_2_vars_exact_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @lshr_2_vars_exact_undef_mask_elt(
+; CHECK-NEXT:    [[T1:%.*]] = lshr exact <4 x i32> <i32 1, i32 2, i32 3, i32 4>, [[V0:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = lshr exact <4 x i32> <i32 5, i32 6, i32 7, i32 8>, [[V1:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = lshr exact <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = lshr exact <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Try weird types.
+
+define <3 x i32> @ashr_2_vars(<3 x i32> %v0, <3 x i32> %v1) {
+; CHECK-LABEL: @ashr_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[V1:%.*]], <3 x i32> [[V0:%.*]], <3 x i32> <i32 0, i32 4, i32 5>
+; CHECK-NEXT:    [[T3:%.*]] = ashr <3 x i32> [[TMP1]], <i32 4, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x i32> [[T3]]
+;
+  %t1 = ashr <3 x i32> %v0, <i32 1, i32 2, i32 3>
+  %t2 = ashr <3 x i32> %v1, <i32 4, i32 5, i32 6>
+  %t3 = shufflevector <3 x i32> %t1, <3 x i32> %t2, <3 x i32> <i32 3, i32 1, i32 2>
+  ret <3 x i32> %t3
+}
+
+define <3 x i42> @and_2_vars(<3 x i42> %v0, <3 x i42> %v1) {
+; CHECK-LABEL: @and_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i42> [[V0:%.*]], <3 x i42> [[V1:%.*]], <3 x i32> <i32 0, i32 4, i32 undef>
+; CHECK-NEXT:    [[T3:%.*]] = and <3 x i42> [[TMP1]], <i42 1, i42 5, i42 undef>
+; CHECK-NEXT:    ret <3 x i42> [[T3]]
+;
+  %t1 = and <3 x i42> %v0, <i42 1, i42 2, i42 3>
+  %t2 = and <3 x i42> %v1, <i42 4, i42 5, i42 6>
+  %t3 = shufflevector <3 x i42> %t1, <3 x i42> %t2, <3 x i32> <i32 0, i32 4, i32 undef>
+  ret <3 x i42> %t3
+}
+
+; It doesn't matter if only one intermediate op has extra uses.
+
+define <4 x i32> @or_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @or_2_vars(
+; CHECK-NEXT:    [[T1:%.*]] = or <4 x i32> [[V0:%.*]], <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = or <4 x i32> [[TMP1]], <i32 5, i32 6, i32 3, i32 4>
+; CHECK-NEXT:    call void @use_v4i32(<4 x i32> [[T1]])
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = or <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = or <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  call void @use_v4i32(<4 x i32> %t1)
+  ret <4 x i32> %t3
+}
+
+; But we don't transform if both intermediate values have extra uses.
+
+define <4 x i32> @xor_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @xor_2_vars(
+; CHECK-NEXT:    [[T1:%.*]] = xor <4 x i32> [[V0:%.*]], <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[T2:%.*]] = xor <4 x i32> [[V1:%.*]], <i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    call void @use_v4i32(<4 x i32> [[T1]])
+; CHECK-NEXT:    call void @use_v4i32(<4 x i32> [[T2]])
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = xor <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = xor <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  call void @use_v4i32(<4 x i32> %t1)
+  call void @use_v4i32(<4 x i32> %t2)
+  ret <4 x i32> %t3
+}
+
+; Div/rem need special handling if the shuffle has undef elements.
+
+define <4 x i32> @udiv_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @udiv_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = udiv <4 x i32> <i32 5, i32 2, i32 3, i32 8>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = udiv <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = udiv <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @udiv_2_vars_exact(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @udiv_2_vars_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = udiv exact <4 x i32> <i32 5, i32 2, i32 3, i32 8>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = udiv exact <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = udiv exact <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; TODO: This could be transformed using a safe constant.
+
+define <4 x i32> @udiv_2_vars_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @udiv_2_vars_undef_mask_elt(
+; CHECK-NEXT:    [[T1:%.*]] = udiv <4 x i32> <i32 1, i32 2, i32 3, i32 4>, [[V0:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = udiv <4 x i32> <i32 5, i32 6, i32 7, i32 8>, [[V1:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = udiv <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = udiv <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; TODO: This could be transformed using a safe constant.
+
+define <4 x i32> @udiv_2_vars_exact_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @udiv_2_vars_exact_undef_mask_elt(
+; CHECK-NEXT:    [[T1:%.*]] = udiv exact <4 x i32> <i32 1, i32 2, i32 3, i32 4>, [[V0:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = udiv exact <4 x i32> <i32 5, i32 6, i32 7, i32 8>, [[V1:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = udiv exact <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = udiv exact <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+  ret <4 x i32> %t3
+}
+
+; If the shuffle has no undefs, it's safe to shuffle the variables first.
+
+define <4 x i32> @sdiv_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @sdiv_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = sdiv <4 x i32> [[TMP1]], <i32 1, i32 2, i32 7, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sdiv <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = sdiv <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @sdiv_2_vars_exact(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @sdiv_2_vars_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = sdiv exact <4 x i32> [[TMP1]], <i32 1, i32 2, i32 7, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sdiv exact <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = sdiv exact <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Div by undef is UB. Undef is replaced by safe constant.
+
+define <4 x i32> @sdiv_2_vars_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @sdiv_2_vars_undef_mask_elt(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+; CHECK-NEXT:    [[T3:%.*]] = sdiv <4 x i32> [[TMP1]], <i32 1, i32 2, i32 7, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sdiv <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = sdiv <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  ret <4 x i32> %t3
+}
+
+; Div by undef is UB. Undef is replaced by safe constant.
+
+define <4 x i32> @sdiv_2_vars_exact_undef_mask_elt(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @sdiv_2_vars_exact_undef_mask_elt(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+; CHECK-NEXT:    [[T3:%.*]] = sdiv exact <4 x i32> [[TMP1]], <i32 1, i32 2, i32 7, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = sdiv exact <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = sdiv exact <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  ret <4 x i32> %t3
+}
+
+; If the shuffle has no undefs, it's safe to shuffle the variables first.
+
+define <4 x i32> @urem_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @urem_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = urem <4 x i32> <i32 1, i32 2, i32 7, i32 8>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = urem <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = urem <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @srem_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @srem_2_vars(
+; CHECK-NEXT:    [[T1:%.*]] = srem <4 x i32> <i32 1, i32 2, i32 3, i32 4>, [[V0:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = srem <4 x i32> <i32 5, i32 6, i32 7, i32 8>, [[V1:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> <i32 0, i32 undef, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = srem <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = srem <4 x i32> <i32 5, i32 6, i32 7, i32 8>, %v1
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 undef, i32 6, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Try FP ops/types.
+
+define <4 x float> @fadd_2_vars(<4 x float> %v0, <4 x float> %v1) {
+; CHECK-LABEL: @fadd_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V0:%.*]], <4 x float> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = fadd <4 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 7.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:    ret <4 x float> [[T3]]
+;
+  %t1 = fadd <4 x float> %v0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %t2 = fadd <4 x float> %v1, <float 5.0, float 6.0, float 7.0, float 8.0>
+  %t3 = shufflevector <4 x float> %t1, <4 x float> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %t3
+}
+
+define <4 x double> @fsub_2_vars(<4 x double> %v0, <4 x double> %v1) {
+; CHECK-LABEL: @fsub_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = fsub <4 x double> <double undef, double 2.000000e+00, double 7.000000e+00, double 8.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x double> [[T3]]
+;
+  %t1 = fsub <4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, %v0
+  %t2 = fsub <4 x double> <double 5.0, double 6.0, double 7.0, double 8.0>, %v1
+  %t3 = shufflevector <4 x double> %t1, <4 x double> %t2, <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+  ret <4 x double> %t3
+}
+
+; Intersect any FMF.
+
+define <4 x float> @fmul_2_vars(<4 x float> %v0, <4 x float> %v1) {
+; CHECK-LABEL: @fmul_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V0:%.*]], <4 x float> [[V1:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = fmul reassoc nsz <4 x float> [[TMP1]], <float 1.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:    ret <4 x float> [[T3]]
+;
+  %t1 = fmul reassoc nsz <4 x float> %v0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %t2 = fmul reassoc nsz <4 x float> %v1, <float 5.0, float 6.0, float 7.0, float 8.0>
+  %t3 = shufflevector <4 x float> %t1, <4 x float> %t2, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %t3
+}
+
+define <4 x double> @frem_2_vars(<4 x double> %v0, <4 x double> %v1) {
+; CHECK-LABEL: @frem_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = frem nnan <4 x double> <double undef, double 2.000000e+00, double 7.000000e+00, double 8.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    ret <4 x double> [[T3]]
+;
+  %t1 = frem nnan ninf <4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, %v0
+  %t2 = frem nnan arcp <4 x double> <double 5.0, double 6.0, double 7.0, double 8.0>, %v1
+  %t3 = shufflevector <4 x double> %t1, <4 x double> %t2, <4 x i32> <i32 undef, i32 1, i32 6, i32 7>
+  ret <4 x double> %t3
+}
+
+; The variable operand must be either the first operand or the second operand in both binops.
+
+define <4 x double> @fdiv_2_vars(<4 x double> %v0, <4 x double> %v1) {
+; CHECK-LABEL: @fdiv_2_vars(
+; CHECK-NEXT:    [[T1:%.*]] = fdiv <4 x double> <double 1.000000e+00, double 2.000000e+00, double poison, double poison>, [[V0:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = fdiv <4 x double> [[V1:%.*]], <double poison, double poison, double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x double> [[T1]], <4 x double> [[T2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[T3]]
+;
+  %t1 = fdiv <4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, %v0
+  %t2 = fdiv <4 x double> %v1, <double 5.0, double 6.0, double 7.0, double 8.0>
+  %t3 = shufflevector <4 x double> %t1, <4 x double> %t2, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %t3
+}
+
+; Shift-left with constant shift amount can be converted to mul to enable the fold.
+
+define <4 x i32> @mul_shl(<4 x i32> %v0) {
+; CHECK-LABEL: @mul_shl(
+; CHECK-NEXT:    [[T3:%.*]] = mul nuw <4 x i32> [[V0:%.*]], <i32 32, i32 64, i32 3, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = mul nuw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl nuw <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Try with shift as operand 0 of the shuffle; 'nsw' is dropped for safety, but that could be improved.
+
+define <4 x i32> @shl_mul(<4 x i32> %v0) {
+; CHECK-LABEL: @shl_mul(
+; CHECK-NEXT:    [[T3:%.*]] = mul <4 x i32> [[V0:%.*]], <i32 5, i32 undef, i32 8, i32 16>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl nsw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = mul nsw <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Demanded elements + simplification can remove the mul alone, but that's not the best case.
+
+define <4 x i32> @mul_is_nop_shl(<4 x i32> %v0) {
+; CHECK-LABEL: @mul_is_nop_shl(
+; CHECK-NEXT:    [[T3:%.*]] = shl <4 x i32> [[V0:%.*]], <i32 0, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = mul <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %t3
+}
+
+; Negative test: shift amount (operand 1) must be constant.
+
+define <4 x i32> @shl_mul_not_constant_shift_amount(<4 x i32> %v0) {
+; CHECK-LABEL: @shl_mul_not_constant_shift_amount(
+; CHECK-NEXT:    [[T1:%.*]] = shl <4 x i32> <i32 1, i32 2, i32 3, i32 4>, [[V0:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = mul <4 x i32> [[V0]], <i32 5, i32 6, i32 poison, i32 poison>
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T2]], <4 x i32> [[T1]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %v0
+  %t2 = mul <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Try with 2 variable inputs.
+
+define <4 x i32> @mul_shl_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @mul_shl_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = mul nuw <4 x i32> [[TMP1]], <i32 32, i32 64, i32 3, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = mul nuw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shl nuw <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+define <4 x i32> @shl_mul_2_vars(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @shl_mul_2_vars(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <4 x i32> <i32 0, i32 undef, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = mul <4 x i32> [[TMP1]], <i32 5, i32 undef, i32 8, i32 16>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %t1 = shl nsw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
+  %t2 = mul nsw <4 x i32> %v1, <i32 5, i32 6, i32 7, i32 8>
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Or with constant can be converted to add to enable the fold.
+; The 'shl' is here to allow analysis to determine that the 'or' can be transformed to 'add'.
+; TODO: The 'or' constant is limited to a splat.
+
+define <4 x i32> @add_or(<4 x i32> %v) {
+; CHECK-LABEL: @add_or(
+; CHECK-NEXT:    [[V0:%.*]] = shl <4 x i32> [[V:%.*]], <i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[T3:%.*]] = add <4 x i32> [[V0]], <i32 31, i32 31, i32 65536, i32 65537>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %v0 = shl <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>                   ; clear the bottom bits
+  %t1 = add <4 x i32> %v0, <i32 65534, i32 65535, i32 65536, i32 65537>  ; this can't be converted to 'or'
+  %t2 = or <4 x i32> %v0, <i32 31, i32 31, i32 31, i32 31>               ; set the bottom bits
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+; Try with 'or' as operand 0 of the shuffle.
+
+define <4 x i8> @or_add(<4 x i8> %v) {
+; CHECK-LABEL: @or_add(
+; CHECK-NEXT:    [[V0:%.*]] = lshr <4 x i8> [[V:%.*]], <i8 3, i8 3, i8 3, i8 3>
+; CHECK-NEXT:    [[T3:%.*]] = add nuw nsw <4 x i8> [[V0]], <i8 1, i8 2, i8 -64, i8 -64>
+; CHECK-NEXT:    ret <4 x i8> [[T3]]
+;
+  %v0 = lshr <4 x i8> %v, <i8 3, i8 3, i8 3, i8 3>          ; clear the top bits
+  %t1 = or <4 x i8> %v0, <i8 192, i8 192, i8 192, i8 192>   ; set some top bits
+  %t2 = add nsw nuw <4 x i8> %v0, <i8 1, i8 2, i8 3, i8 4>  ; this can't be converted to 'or'
+  %t3 = shufflevector <4 x i8> %t1, <4 x i8> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i8> %t3
+}
+
+; Negative test: not all 'or' insts can be converted to 'add'.
+
+define <4 x i8> @or_add_not_enough_masking(<4 x i8> %v) {
+; CHECK-LABEL: @or_add_not_enough_masking(
+; CHECK-NEXT:    [[V0:%.*]] = lshr <4 x i8> [[V:%.*]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[T1:%.*]] = or <4 x i8> [[V0]], <i8 poison, i8 poison, i8 -64, i8 -64>
+; CHECK-NEXT:    [[T2:%.*]] = add <4 x i8> [[V0]], <i8 1, i8 2, i8 poison, i8 poison>
+; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i8> [[T2]], <4 x i8> [[T1]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i8> [[T3]]
+;
+  %v0 = lshr <4 x i8> %v, <i8 1, i8 1, i8 1, i8 1>          ; clear not enough top bits
+  %t1 = or <4 x i8> %v0, <i8 192, i8 192, i8 192, i8 192>   ; set some top bits
+  %t2 = add nsw nuw <4 x i8> %v0, <i8 1, i8 2, i8 3, i8 4>  ; this can't be converted to 'or'
+  %t3 = shufflevector <4 x i8> %t1, <4 x i8> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i8> %t3
+}
+
+; Try with 2 variable inputs.
+
+define <4 x i32> @add_or_2_vars(<4 x i32> %v, <4 x i32> %v1) {
+; CHECK-LABEL: @add_or_2_vars(
+; CHECK-NEXT:    [[V0:%.*]] = shl <4 x i32> [[V:%.*]], <i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = add <4 x i32> [[TMP1]], <i32 31, i32 31, i32 65536, i32 65537>
+; CHECK-NEXT:    ret <4 x i32> [[T3]]
+;
+  %v0 = shl <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>                   ; clear the bottom bits
+  %t1 = add <4 x i32> %v1, <i32 65534, i32 65535, i32 65536, i32 65537>  ; this can't be converted to 'or'
+  %t2 = or <4 x i32> %v0, <i32 31, i32 31, i32 31, i32 31>               ; set the bottom bits
+  %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i32> %t3
+}
+
+define <4 x i8> @or_add_2_vars(<4 x i8> %v, <4 x i8> %v1) {
+; CHECK-LABEL: @or_add_2_vars(
+; CHECK-NEXT:    [[V0:%.*]] = lshr <4 x i8> [[V:%.*]], <i8 3, i8 3, i8 3, i8 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[V1:%.*]], <4 x i8> [[V0]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = add nuw nsw <4 x i8> [[TMP1]], <i8 1, i8 2, i8 -64, i8 -64>
+; CHECK-NEXT:    ret <4 x i8> [[T3]]
+;
+  %v0 = lshr <4 x i8> %v, <i8 3, i8 3, i8 3, i8 3>          ; clear the top bits
+  %t1 = or <4 x i8> %v0, <i8 192, i8 192, i8 192, i8 192>   ; set some top bits
+  %t2 = add nsw nuw <4 x i8> %v1, <i8 1, i8 2, i8 3, i8 4>  ; this can't be converted to 'or'
+  %t3 = shufflevector <4 x i8> %t1, <4 x i8> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i8> %t3
+}
+
+; The undef operand is used to simplify the shuffle mask, but don't assert that too soon.
+
+define <4 x i32> @PR41419(<4 x i32> %v) {
+; CHECK-LABEL: @PR41419(
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 undef, i32 undef, i32 2, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[S]]
+;
+  %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x i32> %s
+}
+

diff --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
new file mode 100644
index 000000000000..0a88370df05a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare void @use(<4 x i16>)
+
+define void @test(<16 x i8> %w, i32* %o1, float* %o2) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[V_BC:%.*]] = bitcast <16 x i8> [[W:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[V_EXTRACT:%.*]] = extractelement <4 x i32> [[V_BC]], i32 3
+; CHECK-NEXT:    [[V_BC1:%.*]] = bitcast <16 x i8> [[W]] to <4 x float>
+; CHECK-NEXT:    [[V_EXTRACT2:%.*]] = extractelement <4 x float> [[V_BC1]], i32 3
+; CHECK-NEXT:    store i32 [[V_EXTRACT]], i32* [[O1:%.*]], align 4
+; CHECK-NEXT:    store float [[V_EXTRACT2]], float* [[O2:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %v = shufflevector <16 x i8> %w, <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %f = bitcast <4 x i8> %v to float
+  %i = bitcast <4 x i8> %v to i32
+  store i32 %i, i32* %o1, align 4
+  store float %f, float* %o2, align 4
+  ret void
+}
+
+; Shuffle-of-bitcast-splat --> splat-bitcast
+
+define <4 x i16> @splat_bitcast_operand(<8 x i8> %x) {
+; CHECK-LABEL: @splat_bitcast_operand(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x i8> [[X:%.*]], <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[S2:%.*]] = bitcast <8 x i8> [[S1]] to <4 x i16>
+; CHECK-NEXT:    ret <4 x i16> [[S2]]
+;
+  %s1 = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %bc = bitcast <8 x i8> %s1 to <4 x i16>
+  %s2 = shufflevector <4 x i16> %bc, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 0>
+  ret <4 x i16> %s2
+}
+
+; Shuffle-of-bitcast-splat --> splat-bitcast
+
+define <4 x i16> @splat_bitcast_operand_uses(<8 x i8> %x) {
+; CHECK-LABEL: @splat_bitcast_operand_uses(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x i8> [[X:%.*]], <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x i8> [[S1]] to <4 x i16>
+; CHECK-NEXT:    call void @use(<4 x i16> [[BC]])
+; CHECK-NEXT:    [[S2:%.*]] = bitcast <8 x i8> [[S1]] to <4 x i16>
+; CHECK-NEXT:    ret <4 x i16> [[S2]]
+;
+  %s1 = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %bc = bitcast <8 x i8> %s1 to <4 x i16>
+  call void @use(<4 x i16> %bc)
+  %s2 = shufflevector <4 x i16> %bc, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 0>
+  ret <4 x i16> %s2
+}
+
+; Shuffle-of-bitcast-splat --> splat-bitcast
+
+define <4 x i32> @splat_bitcast_operand_same_size_src_elt(<4 x float> %x) {
+; CHECK-LABEL: @splat_bitcast_operand_same_size_src_elt(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[S2:%.*]] = bitcast <4 x float> [[S1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[S2]]
+;
+  %s1 = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %bc = bitcast <4 x float> %s1 to <4 x i32>
+  %s2 = shufflevector <4 x i32> %bc, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 0>
+  ret <4 x i32> %s2
+}
+
+; Scaled mask is inverse of first mask.
+
+define <4 x i32> @shuf_bitcast_operand(<16 x i8> %x) {
+; CHECK-LABEL: @shuf_bitcast_operand(
+; CHECK-NEXT:    [[S2:%.*]] = bitcast <16 x i8> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[S2]]
+;
+  %s1 = shufflevector <16 x i8> %x, <16 x i8> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  %bc = bitcast <16 x i8> %s1 to <4 x i32>
+  %s2 = shufflevector <4 x i32> %bc, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %s2
+}
+
+; TODO: Could allow fold for length-changing shuffles.
+
+define <5 x i16> @splat_bitcast_operand_change_type(<8 x i8> %x) {
+; CHECK-LABEL: @splat_bitcast_operand_change_type(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x i8> [[X:%.*]], <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x i8> [[S1]] to <4 x i16>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <4 x i16> [[BC]], <4 x i16> poison, <5 x i32> <i32 0, i32 2, i32 1, i32 0, i32 3>
+; CHECK-NEXT:    ret <5 x i16> [[S2]]
+;
+  %s1 = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %bc = bitcast <8 x i8> %s1 to <4 x i16>
+  %s2 = shufflevector <4 x i16> %bc, <4 x i16> poison, <5 x i32> <i32 0, i32 2, i32 1, i32 0, i32 3>
+  ret <5 x i16> %s2
+}
+
+; Shuffle-of-bitcast-splat --> splat-bitcast
+
+define <4 x i16> @splat_bitcast_operand_wider_src_elt(<2 x i32> %x) {
+; CHECK-LABEL: @splat_bitcast_operand_wider_src_elt(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[S2:%.*]] = bitcast <2 x i32> [[S1]] to <4 x i16>
+; CHECK-NEXT:    ret <4 x i16> [[S2]]
+;
+  %s1 = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+  %bc = bitcast <2 x i32> %s1 to <4 x i16>
+  %s2 = shufflevector <4 x i16> %bc, <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i16> %s2
+}
+
+; Shuffle-of-bitcast-splat --> splat-bitcast
+
+define <4 x i16> @splat_bitcast_operand_wider_src_elt_uses(<2 x i32> %x) {
+; CHECK-LABEL: @splat_bitcast_operand_wider_src_elt_uses(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i32> [[S1]] to <4 x i16>
+; CHECK-NEXT:    call void @use(<4 x i16> [[BC]])
+; CHECK-NEXT:    [[S2:%.*]] = bitcast <2 x i32> [[S1]] to <4 x i16>
+; CHECK-NEXT:    ret <4 x i16> [[S2]]
+;
+  %s1 = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+  %bc = bitcast <2 x i32> %s1 to <4 x i16>
+  call void @use(<4 x i16> %bc)
+  %s2 = shufflevector <4 x i16> %bc, <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i16> %s2
+}
+
+; Scaled mask is inverse of first mask.
+
+define <16 x i8> @shuf_bitcast_operand_wider_src(<4 x i32> %x) {
+; CHECK-LABEL: @shuf_bitcast_operand_wider_src(
+; CHECK-NEXT:    [[S2:%.*]] = bitcast <4 x i32> [[X:%.*]] to <16 x i8>
+; CHECK-NEXT:    ret <16 x i8> [[S2]]
+;
+  %s1 = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc = bitcast <4 x i32> %s1 to <16 x i8>
+  %s2 = shufflevector <16 x i8> %bc, <16 x i8> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x i8> %s2
+}
+
+; Negative test - the 2nd mask can't be widened
+
+define <16 x i8> @shuf_bitcast_operand_cannot_widen(<4 x i32> %x) {
+; CHECK-LABEL: @shuf_bitcast_operand_cannot_widen(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[S1]] to <16 x i8>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[BC]], <16 x i8> poison, <16 x i32> <i32 12, i32 13, i32 12, i32 13, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <16 x i8> [[S2]]
+;
+  %s1 = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc = bitcast <4 x i32> %s1 to <16 x i8>
+  %s2 = shufflevector <16 x i8> %bc, <16 x i8> poison, <16 x i32> <i32 12, i32 13, i32 12, i32 13, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x i8> %s2
+}
+
+; Negative test - the 2nd mask can't be widened
+
+define <16 x i8> @shuf_bitcast_operand_cannot_widen_undef(<4 x i32> %x) {
+; CHECK-LABEL: @shuf_bitcast_operand_cannot_widen_undef(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[S1]] to <16 x i8>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[BC]], <16 x i8> poison, <16 x i32> <i32 12, i32 undef, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <16 x i8> [[S2]]
+;
+  %s1 = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc = bitcast <4 x i32> %s1 to <16 x i8>
+  %s2 = shufflevector <16 x i8> %bc, <16 x i8> poison, <16 x i32> <i32 12, i32 undef, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x i8> %s2
+}

diff --git a/llvm/test/Transforms/InstCombine/shufflevec-constant-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shufflevec-constant-inseltpoison.ll
new file mode 100644
index 000000000000..f2850569c266
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/shufflevec-constant-inseltpoison.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9"
+
+define <4 x float> @__inff4() nounwind readnone {
+; CHECK-LABEL: @__inff4(
+; CHECK-NEXT:    ret <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0x7FF0000000000000, float 0x7FF0000000000000>
+;
+  %tmp14 = extractelement <1 x double> bitcast (<2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000> to <1 x double>), i32 0
+  %tmp4 = bitcast double %tmp14 to i64
+  %tmp3 = bitcast i64 %tmp4 to <2 x float>
+  %tmp8 = shufflevector <2 x float> %tmp3, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %tmp9 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp8, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x float> %tmp9
+}

diff --git a/llvm/test/Transforms/InstCombine/shufflevector-div-rem-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shufflevector-div-rem-inseltpoison.ll
index 526048e0fc91..8098ac333b75 100644
--- a/llvm/test/Transforms/InstCombine/shufflevector-div-rem-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevector-div-rem-inseltpoison.ll
@@ -16,7 +16,7 @@ define i16 @test_srem_orig(i16 %a, i1 %cmp) {
 ; CHECK-NEXT:    ret i16 [[T3]]
 ;
   %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
-  %splat = shufflevector <2 x i16> %splatinsert, <2 x i16> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i16> %splatinsert, <2 x i16> poison, <2 x i32> zeroinitializer
   %t1 = select i1 %cmp, <2 x i16> <i16 1, i16 1>, <2 x i16> %splat
   %t2 = srem <2 x i16> %t1, <i16 2, i16 2>
   %t3 = extractelement <2 x i16> %t2, i32 1
@@ -31,13 +31,13 @@ define <2 x i16> @test_srem(i16 %a, i1 %cmp) {
 ; CHECK-LABEL: @test_srem(
 ; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[T1:%.*]] = srem <2 x i16> [[SPLATINSERT]], <i16 2, i16 1>
-; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> poison, <2 x i32> <i32 undef, i32 0>
 ; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x i16> <i16 77, i16 99>, <2 x i16> [[SPLAT_OP]]
 ; CHECK-NEXT:    ret <2 x i16> [[T2]]
 ;
   %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
   %t1 = srem <2 x i16> %splatinsert, <i16 2, i16 1>
-  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> poison, <2 x i32> <i32 undef, i32 0>
   %t2 = select i1 %cmp, <2 x i16> <i16 77, i16 99>, <2 x i16> %splat.op
   ret <2 x i16> %t2
 }
@@ -46,13 +46,13 @@ define <2 x i16> @test_urem(i16 %a, i1 %cmp) {
 ; CHECK-LABEL: @test_urem(
 ; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[T1:%.*]] = urem <2 x i16> [[SPLATINSERT]], <i16 3, i16 1>
-; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> poison, <2 x i32> <i32 undef, i32 0>
 ; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x i16> <i16 77, i16 99>, <2 x i16> [[SPLAT_OP]]
 ; CHECK-NEXT:    ret <2 x i16> [[T2]]
 ;
   %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
   %t1 = urem <2 x i16> %splatinsert, <i16 3, i16 1>
-  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> poison, <2 x i32> <i32 undef, i32 0>
   %t2 = select i1 %cmp, <2 x i16> <i16 77, i16 99>, <2 x i16> %splat.op
   ret <2 x i16> %t2
 }
@@ -61,13 +61,13 @@ define <2 x i16> @test_sdiv(i16 %a, i1 %cmp) {
 ; CHECK-LABEL: @test_sdiv(
 ; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[T1:%.*]] = sdiv <2 x i16> [[SPLATINSERT]], <i16 2, i16 1>
-; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> poison, <2 x i32> <i32 undef, i32 0>
 ; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x i16> <i16 77, i16 99>, <2 x i16> [[SPLAT_OP]]
 ; CHECK-NEXT:    ret <2 x i16> [[T2]]
 ;
   %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
   %t1 = sdiv <2 x i16> %splatinsert, <i16 2, i16 1>
-  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> poison, <2 x i32> <i32 undef, i32 0>
   %t2 = select i1 %cmp, <2 x i16> <i16 77, i16 99>, <2 x i16> %splat.op
   ret <2 x i16> %t2
 }
@@ -76,13 +76,13 @@ define <2 x i16> @test_udiv(i16 %a, i1 %cmp) {
 ; CHECK-LABEL: @test_udiv(
 ; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[T1:%.*]] = udiv <2 x i16> [[SPLATINSERT]], <i16 3, i16 1>
-; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> poison, <2 x i32> <i32 undef, i32 0>
 ; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x i16> <i16 77, i16 99>, <2 x i16> [[SPLAT_OP]]
 ; CHECK-NEXT:    ret <2 x i16> [[T2]]
 ;
   %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
   %t1 = udiv <2 x i16> %splatinsert, <i16 3, i16 1>
-  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> poison, <2 x i32> <i32 undef, i32 0>
   %t2 = select i1 %cmp, <2 x i16> <i16 77, i16 99>, <2 x i16> %splat.op
   ret <2 x i16> %t2
 }
@@ -99,7 +99,7 @@ define <2 x float> @test_fdiv(float %a, float %b, i1 %cmp) {
   %splatinsert = insertelement <2 x float> poison, float %a, i32 0
   %denom = insertelement <2 x float> <float 3.0, float undef>, float 1.0, i32 1
   %t1 = fdiv <2 x float> %splatinsert, %denom
-  %splat.op = shufflevector <2 x float> %t1, <2 x float> undef, <2 x i32> <i32 undef, i32 0>
+  %splat.op = shufflevector <2 x float> %t1, <2 x float> poison, <2 x i32> <i32 undef, i32 0>
   %t2 = select i1 %cmp, <2 x float> <float 77.0, float 99.0>, <2 x float> %splat.op
   ret <2 x float> %t2
 }
@@ -116,7 +116,7 @@ define <2 x float> @test_frem(float %a, float %b, i1 %cmp) {
   %splatinsert = insertelement <2 x float> poison, float %a, i32 0
   %denom = insertelement <2 x float> <float 3.0, float undef>, float 1.0, i32 1
   %t1 = frem <2 x float> %splatinsert, %denom
-  %splat.op = shufflevector <2 x float> %t1, <2 x float> undef, <2 x i32> <i32 undef, i32 0>
+  %splat.op = shufflevector <2 x float> %t1, <2 x float> poison, <2 x i32> <i32 undef, i32 0>
   %t2 = select i1 %cmp, <2 x float> <float 77.0, float 99.0>, <2 x float> %splat.op
   ret <2 x float> %t2
 }

diff --git a/llvm/test/Transforms/InstCombine/sub-of-negatible-inseltpoison.ll b/llvm/test/Transforms/InstCombine/sub-of-negatible-inseltpoison.ll
new file mode 100644
index 000000000000..b80b7798a855
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/sub-of-negatible-inseltpoison.ll
@@ -0,0 +1,1406 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare void @use4(i4)
+declare void @use8(i8)
+declare void @use_v2i4(<2 x i4>)
+declare i1 @use32gen1(i32)
+
+; Constant can be freely negated.
+define i8 @t0(i8 %x) {
+; CHECK-LABEL: @t0(
+; CHECK-NEXT:    [[T0:%.*]] = add i8 [[X:%.*]], 42
+; CHECK-NEXT:    ret i8 [[T0]]
+;
+  %t0 = sub i8 %x, -42
+  ret i8 %t0
+}
+
+; Negation can be negated for free
+define i8 @t1(i8 %x, i8 %y) {
+; CHECK-LABEL: @t1(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = add i8 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = sub i8 0, %y
+  call void @use8(i8 %t0)
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+
+; Shift-left can be negated if all uses can be updated
+define i8 @t2(i8 %x, i8 %y) {
+; CHECK-LABEL: @t2(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = shl i8 42, [[Y:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = shl i8 -42, %y
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @n2(i8 %x, i8 %y) {
+; CHECK-LABEL: @n2(
+; CHECK-NEXT:    [[T0:%.*]] = shl i8 -42, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = shl i8 -42, %y
+  call void @use8(i8 %t0)
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @t3(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @t3(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1_NEG:%.*]] = shl i8 [[Z]], [[Y:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 0, %z
+  call void @use8(i8 %t0)
+  %t1 = shl i8 %t0, %y
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @n3(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @n3(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = shl i8 [[T0]], [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 0, %z
+  call void @use8(i8 %t0)
+  %t1 = shl i8 %t0, %y
+  call void @use8(i8 %t1)
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+
+; Select can be negated if all its operands can be negated and all the users of the select can be updated
+define i8 @t4(i8 %x, i1 %y) {
+; CHECK-LABEL: @t4(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = select i1 [[Y:%.*]], i8 42, i8 -44
+; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = select i1 %y, i8 -42, i8 44
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @n4(i8 %x, i1 %y) {
+; CHECK-LABEL: @n4(
+; CHECK-NEXT:    [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 44
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = select i1 %y, i8 -42, i8 44
+  call void @use8(i8 %t0)
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @n5(i8 %x, i1 %y, i8 %z) {
+; CHECK-LABEL: @n5(
+; CHECK-NEXT:    [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 [[Z:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = select i1 %y, i8 -42, i8 %z
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @t6(i8 %x, i1 %y, i8 %z) {
+; CHECK-LABEL: @t6(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1_NEG:%.*]] = select i1 [[Y:%.*]], i8 42, i8 [[Z]]
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 0, %z
+  call void @use8(i8 %t0)
+  %t1 = select i1 %y, i8 -42, i8 %t0
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @t7(i8 %x, i1 %y, i8 %z) {
+; CHECK-LABEL: @t7(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = shl i8 -1, [[Z:%.*]]
+; CHECK-NEXT:    [[T1_NEG:%.*]] = select i1 [[Y:%.*]], i8 0, i8 [[T0_NEG]]
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = shl i8 1, %z
+  %t1 = select i1 %y, i8 0, i8 %t0
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @n8(i8 %x, i1 %y, i8 %z) {
+; CHECK-LABEL: @n8(
+; CHECK-NEXT:    [[T0:%.*]] = shl i8 1, [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = select i1 [[Y:%.*]], i8 0, i8 [[T0]]
+; CHECK-NEXT:    [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = shl i8 1, %z
+  call void @use8(i8 %t0)
+  %t1 = select i1 %y, i8 0, i8 %t0
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+
+; Subtraction can be negated by swapping its operands.
+; x - (y - z) -> x - y + z -> x + (z - y)
+define i8 @t9(i8 %x, i8 %y) {
+; CHECK-LABEL: @t9(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[T0_NEG]]
+;
+  %t0 = sub i8 %y, %x
+  %t1 = sub i8 0, %t0
+  ret i8 %t1
+}
+
+define i8 @n10(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @n10(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 0, [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = sub i8 %y, %x
+  call void @use8(i8 %t0)
+  %t1 = sub i8 0, %t0
+  ret i8 %t1
+}
+
+define i8 @neg_of_sub_from_constant(i8 %x) {
+; CHECK-LABEL: @neg_of_sub_from_constant(
+; CHECK-NEXT:    [[S_NEG:%.*]] = add i8 [[X:%.*]], -42
+; CHECK-NEXT:    ret i8 [[S_NEG]]
+;
+  %s = sub i8 42, %x
+  %r = sub i8 0, %s
+  ret i8 %r
+}
+
+define i8 @neg_of_sub_from_constant_multi_use(i8 %x) {
+; CHECK-LABEL: @neg_of_sub_from_constant_multi_use(
+; CHECK-NEXT:    [[S_NEG:%.*]] = add i8 [[X:%.*]], -42
+; CHECK-NEXT:    [[S:%.*]] = sub i8 42, [[X]]
+; CHECK-NEXT:    call void @use8(i8 [[S]])
+; CHECK-NEXT:    ret i8 [[S_NEG]]
+;
+  %s = sub i8 42, %x
+  call void @use8(i8 %s)
+  %r = sub i8 0, %s
+  ret i8 %r
+}
+
+define i8 @sub_from_constant_of_sub_from_constant(i8 %x) {
+; CHECK-LABEL: @sub_from_constant_of_sub_from_constant(
+; CHECK-NEXT:    [[R:%.*]] = add i8 [[X:%.*]], -31
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = sub i8 42, %x
+  %r = sub i8 11, %s
+  ret i8 %r
+}
+
+define i8 @sub_from_constant_of_sub_from_constant_multi_use(i8 %x) {
+; CHECK-LABEL: @sub_from_constant_of_sub_from_constant_multi_use(
+; CHECK-NEXT:    [[S:%.*]] = sub i8 42, [[X:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[S]])
+; CHECK-NEXT:    [[R:%.*]] = add i8 [[X]], -31
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = sub i8 42, %x
+  call void @use8(i8 %s)
+  %r = sub i8 11, %s
+  ret i8 %r
+}
+
+define i8 @sub_from_variable_of_sub_from_constant(i8 %x, i8 %y) {
+; CHECK-LABEL: @sub_from_variable_of_sub_from_constant(
+; CHECK-NEXT:    [[S_NEG:%.*]] = add i8 [[X:%.*]], -42
+; CHECK-NEXT:    [[R:%.*]] = add i8 [[S_NEG]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = sub i8 42, %x
+  %r = sub i8 %y, %s
+  ret i8 %r
+}
+
+define i8 @sub_from_variable_of_sub_from_constant_multi_use(i8 %x, i8 %y) {
+; CHECK-LABEL: @sub_from_variable_of_sub_from_constant_multi_use(
+; CHECK-NEXT:    [[S:%.*]] = sub i8 42, [[X:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[S]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 [[Y:%.*]], [[S]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = sub i8 42, %x
+  call void @use8(i8 %s)
+  %r = sub i8 %y, %s
+  ret i8 %r
+}
+
+; Addition can be negated if both operands can be negated
+; x - (y + z) -> x - y - z -> x + ((-y) + (-z))
+define i8 @t12(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @t12(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 0, [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[Y]], [[Z]]
+; CHECK-NEXT:    [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T3]]
+;
+  %t0 = sub i8 0, %y
+  call void @use8(i8 %t0)
+  %t1 = sub i8 0, %z
+  call void @use8(i8 %t1)
+  %t2 = add i8 %t0, %t1
+  %t3 = sub i8 %x, %t2
+  ret i8 %t3
+}
+define i8 @n13(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @n13(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1_NEG:%.*]] = sub i8 [[Y]], [[Z:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 0, %y
+  call void @use8(i8 %t0)
+  %t1 = add i8 %t0, %z
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @n14(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @n14(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 0, [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[Y]], [[Z]]
+; CHECK-NEXT:    [[T2:%.*]] = sub i8 0, [[TMP1]]
+; CHECK-NEXT:    call void @use8(i8 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T3]]
+;
+  %t0 = sub i8 0, %y
+  call void @use8(i8 %t0)
+  %t1 = sub i8 0, %z
+  call void @use8(i8 %t1)
+  %t2 = add i8 %t0, %t1
+  call void @use8(i8 %t2)
+  %t3 = sub i8 %x, %t2
+  ret i8 %t3
+}
+
+define i8 @neg_of_add_with_constant(i8 %x) {
+; CHECK-LABEL: @neg_of_add_with_constant(
+; CHECK-NEXT:    [[R:%.*]] = sub i8 -42, [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = add i8 %x, 42
+  %r = sub i8 0, %s
+  ret i8 %r
+}
+
+define i8 @neg_of_add_with_constant_multi_use(i8 %x) {
+; CHECK-LABEL: @neg_of_add_with_constant_multi_use(
+; CHECK-NEXT:    [[S:%.*]] = add i8 [[X:%.*]], 42
+; CHECK-NEXT:    call void @use8(i8 [[S]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 -42, [[X]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = add i8 %x, 42
+  call void @use8(i8 %s)
+  %r = sub i8 0, %s
+  ret i8 %r
+}
+
+define i8 @sub_from_constant_of_add_with_constant(i8 %x) {
+; CHECK-LABEL: @sub_from_constant_of_add_with_constant(
+; CHECK-NEXT:    [[R:%.*]] = sub i8 -31, [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = add i8 %x, 42
+  %r = sub i8 11, %s
+  ret i8 %r
+}
+
+define i8 @sub_from_constant_of_add_with_constant_multi_use(i8 %x) {
+; CHECK-LABEL: @sub_from_constant_of_add_with_constant_multi_use(
+; CHECK-NEXT:    [[S:%.*]] = add i8 [[X:%.*]], 42
+; CHECK-NEXT:    call void @use8(i8 [[S]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 -31, [[X]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = add i8 %x, 42
+  call void @use8(i8 %s)
+  %r = sub i8 11, %s
+  ret i8 %r
+}
+
+define i8 @sub_from_variable_of_add_with_constant(i8 %x, i8 %y) {
+; CHECK-LABEL: @sub_from_variable_of_add_with_constant(
+; CHECK-NEXT:    [[S:%.*]] = add i8 [[X:%.*]], 42
+; CHECK-NEXT:    [[R:%.*]] = sub i8 [[Y:%.*]], [[S]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = add i8 %x, 42
+  %r = sub i8 %y, %s
+  ret i8 %r
+}
+
+define i8 @sub_from_variable_of_add_with_constant_multi_use(i8 %x, i8 %y) {
+; CHECK-LABEL: @sub_from_variable_of_add_with_constant_multi_use(
+; CHECK-NEXT:    [[S:%.*]] = add i8 [[X:%.*]], 42
+; CHECK-NEXT:    call void @use8(i8 [[S]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 [[Y:%.*]], [[S]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %s = add i8 %x, 42
+  call void @use8(i8 %s)
+  %r = sub i8 %y, %s
+  ret i8 %r
+}
+
+; Multiplication can be negated if either one of the operands can be negated
+; x - (y * z) -> x + ((-y) * z) or x + ((-z) * y)
+define i8 @t15(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @t15(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1_NEG:%.*]] = mul i8 [[Y]], [[Z:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 0, %y
+  call void @use8(i8 %t0)
+  %t1 = mul i8 %t0, %z
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @n16(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @n16(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = mul i8 [[T0]], [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 0, %y
+  call void @use8(i8 %t0)
+  %t1 = mul i8 %t0, %z
+  call void @use8(i8 %t1)
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+
+; Phi can be negated if all incoming values can be negated
+define i8 @t16(i1 %c, i8 %x) {
+; CHECK-LABEL: @t16(
+; CHECK-NEXT:  begin:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[Z_NEG:%.*]] = phi i8 [ [[X:%.*]], [[THEN]] ], [ 42, [[ELSE]] ]
+; CHECK-NEXT:    ret i8 [[Z_NEG]]
+;
+begin:
+  br i1 %c, label %then, label %else
+then:
+  %y = sub i8 0, %x
+  br label %end
+else:
+  br label %end
+end:
+  %z = phi i8 [ %y, %then], [ -42, %else ]
+  %n = sub i8 0, %z
+  ret i8 %n
+}
+define i8 @n17(i1 %c, i8 %x) {
+; CHECK-LABEL: @n17(
+; CHECK-NEXT:  begin:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[Y:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[Z:%.*]] = phi i8 [ [[Y]], [[THEN]] ], [ -42, [[ELSE]] ]
+; CHECK-NEXT:    call void @use8(i8 [[Z]])
+; CHECK-NEXT:    [[N:%.*]] = sub i8 0, [[Z]]
+; CHECK-NEXT:    ret i8 [[N]]
+;
+begin:
+  br i1 %c, label %then, label %else
+then:
+  %y = sub i8 0, %x
+  br label %end
+else:
+  br label %end
+end:
+  %z = phi i8 [ %y, %then], [ -42, %else ]
+  call void @use8(i8 %z)
+  %n = sub i8 0, %z
+  ret i8 %n
+}
+define i8 @n19(i1 %c, i8 %x, i8 %y) {
+; CHECK-LABEL: @n19(
+; CHECK-NEXT:  begin:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[Z:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[R:%.*]] = phi i8 [ [[Z]], [[THEN]] ], [ [[Y:%.*]], [[ELSE]] ]
+; CHECK-NEXT:    [[N:%.*]] = sub i8 0, [[R]]
+; CHECK-NEXT:    ret i8 [[N]]
+;
+begin:
+  br i1 %c, label %then, label %else
+then:
+  %z = sub i8 0, %x
+  br label %end
+else:
+  br label %end
+end:
+  %r = phi i8 [ %z, %then], [ %y, %else ]
+  %n = sub i8 0, %r
+  ret i8 %n
+}
+define void @phi_with_duplicate_incoming_basic_blocks(i32 %x, i32 %y, i1 %should_lookup, i32 %z) {
+; CHECK-LABEL: @phi_with_duplicate_incoming_basic_blocks(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X_INC_NEG:%.*]] = xor i32 [[X:%.*]], -1
+; CHECK-NEXT:    br i1 [[SHOULD_LOOKUP:%.*]], label [[LOOKUP:%.*]], label [[LOOP:%.*]]
+; CHECK:       lookup:
+; CHECK-NEXT:    [[TO_LOOKUP:%.*]] = phi i32 [ [[Y:%.*]], [[ENTRY:%.*]] ], [ [[METAVAL_NEG:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    switch i32 [[TO_LOOKUP]], label [[END:%.*]] [
+; CHECK-NEXT:    i32 0, label [[LOOP]]
+; CHECK-NEXT:    i32 42, label [[LOOP]]
+; CHECK-NEXT:    ]
+; CHECK:       loop:
+; CHECK-NEXT:    [[METAVAL_NEG]] = phi i32 [ [[X_INC_NEG]], [[LOOKUP]] ], [ [[X_INC_NEG]], [[LOOKUP]] ], [ -84, [[ENTRY]] ]
+; CHECK-NEXT:    [[REPEAT:%.*]] = call i1 @use32gen1(i32 [[METAVAL_NEG]])
+; CHECK-NEXT:    br i1 [[REPEAT]], label [[LOOKUP]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %x_inc = add i32 %x, 1
+  br i1 %should_lookup, label %lookup, label %loop
+
+lookup:
+  %to_lookup = phi i32 [ %y, %entry ], [ %negated_metaval, %loop ]
+  switch i32 %to_lookup, label %end [
+  i32 0, label %loop
+  i32 42, label %loop
+  ]
+
+loop:
+  %metaval = phi i32 [ %x_inc, %lookup ], [ %x_inc, %lookup ], [ 84, %entry ]
+  %negated_metaval = sub i32 0, %metaval
+  %repeat = call i1 @use32gen1(i32 %negated_metaval)
+  br i1 %repeat, label %lookup, label %end
+
+end:
+  ret void
+}
+
+; Truncation can be negated if its operand can be negated
+define i8 @t20(i8 %x, i16 %y) {
+; CHECK-LABEL: @t20(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = shl i16 42, [[Y:%.*]]
+; CHECK-NEXT:    [[T1_NEG:%.*]] = trunc i16 [[T0_NEG]] to i8
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = shl i16 -42, %y
+  %t1 = trunc i16 %t0 to i8
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @n21(i8 %x, i16 %y) {
+; CHECK-LABEL: @n21(
+; CHECK-NEXT:    [[T0:%.*]] = shl i16 -42, [[Y:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = trunc i16 [[T0]] to i8
+; CHECK-NEXT:    call void @use8(i8 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = shl i16 -42, %y
+  %t1 = trunc i16 %t0 to i8
+  call void @use8(i8 %t1)
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+
+define i4 @negate_xor(i4 %x) {
+; CHECK-LABEL: @negate_xor(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i4 [[X:%.*]], -6
+; CHECK-NEXT:    [[O_NEG:%.*]] = add i4 [[TMP1]], 1
+; CHECK-NEXT:    ret i4 [[O_NEG]]
+;
+  %o = xor i4 %x, 5
+  %r = sub i4 0, %o
+  ret i4 %r
+}
+
+define <2 x i4> @negate_xor_vec(<2 x i4> %x) {
+; CHECK-LABEL: @negate_xor_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i4> [[X:%.*]], <i4 -6, i4 5>
+; CHECK-NEXT:    [[O_NEG:%.*]] = add <2 x i4> [[TMP1]], <i4 1, i4 1>
+; CHECK-NEXT:    ret <2 x i4> [[O_NEG]]
+;
+  %o = xor <2 x i4> %x, <i4 5, i4 10>
+  %r = sub <2 x i4> zeroinitializer, %o
+  ret <2 x i4> %r
+}
+
+define i8 @negate_xor_use(i8 %x) {
+; CHECK-LABEL: @negate_xor_use(
+; CHECK-NEXT:    [[O:%.*]] = xor i8 [[X:%.*]], 5
+; CHECK-NEXT:    call void @use8(i8 [[O]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 0, [[O]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %o = xor i8 %x, 5
+  call void @use8(i8 %o)
+  %r = sub i8 0, %o
+  ret i8 %r
+}
+
+define i4 @negate_shl_xor(i4 %x, i4 %y) {
+; CHECK-LABEL: @negate_shl_xor(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i4 [[X:%.*]], -6
+; CHECK-NEXT:    [[O_NEG:%.*]] = add i4 [[TMP1]], 1
+; CHECK-NEXT:    [[S_NEG:%.*]] = shl i4 [[O_NEG]], [[Y:%.*]]
+; CHECK-NEXT:    ret i4 [[S_NEG]]
+;
+  %o = xor i4 %x, 5
+  %s = shl i4 %o, %y
+  %r = sub i4 0, %s
+  ret i4 %r
+}
+
+define i8 @negate_shl_not_uses(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_shl_not_uses(
+; CHECK-NEXT:    [[O_NEG:%.*]] = add i8 [[X:%.*]], 1
+; CHECK-NEXT:    [[O:%.*]] = xor i8 [[X]], -1
+; CHECK-NEXT:    call void @use8(i8 [[O]])
+; CHECK-NEXT:    [[S_NEG:%.*]] = shl i8 [[O_NEG]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[S_NEG]]
+;
+  %o = xor i8 %x, -1
+  call void @use8(i8 %o)
+  %s = shl i8 %o, %y
+  %r = sub i8 0, %s
+  ret i8 %r
+}
+
+define <2 x i4> @negate_mul_not_uses_vec(<2 x i4> %x, <2 x i4> %y) {
+; CHECK-LABEL: @negate_mul_not_uses_vec(
+; CHECK-NEXT:    [[O_NEG:%.*]] = add <2 x i4> [[X:%.*]], <i4 1, i4 1>
+; CHECK-NEXT:    [[O:%.*]] = xor <2 x i4> [[X]], <i4 -1, i4 -1>
+; CHECK-NEXT:    call void @use_v2i4(<2 x i4> [[O]])
+; CHECK-NEXT:    [[S_NEG:%.*]] = mul <2 x i4> [[O_NEG]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i4> [[S_NEG]]
+;
+  %o = xor <2 x i4> %x, <i4 -1, i4 -1>
+  call void @use_v2i4(<2 x i4> %o)
+  %s = mul <2 x i4> %o, %y
+  %r = sub <2 x i4> zeroinitializer, %s
+  ret <2 x i4> %r
+}
+
+; signed division can be negated if the divisor can be negated and is not 1 or -1
+define i8 @negate_sdiv(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_sdiv(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = sdiv i8 [[Y:%.*]], -42
+; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = sdiv i8 %y, 42
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_sdiv_extrause(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_sdiv_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = sdiv i8 [[Y:%.*]], 42
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = sdiv i8 %y, 42
+  call void @use8(i8 %t0)
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_sdiv_extrause2(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_sdiv_extrause2(
+; CHECK-NEXT:    [[T0:%.*]] = sdiv i8 [[Y:%.*]], 42
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub nsw i8 0, [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = sdiv i8 %y, 42
+  call void @use8(i8 %t0)
+  %t1 = sub i8 0, %t0
+  ret i8 %t1
+}
+
+; Right-shift sign bit smear is negatible.
+define i8 @negate_ashr(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_ashr(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = lshr i8 [[Y:%.*]], 7
+; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = ashr i8 %y, 7
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_lshr(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_lshr(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = ashr i8 [[Y:%.*]], 7
+; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = lshr i8 %y, 7
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_ashr_extrause(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_ashr_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = ashr i8 [[Y:%.*]], 7
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = ashr i8 %y, 7
+  call void @use8(i8 %t0)
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_lshr_extrause(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_lshr_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = lshr i8 [[Y:%.*]], 7
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = lshr i8 %y, 7
+  call void @use8(i8 %t0)
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_ashr_wrongshift(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_ashr_wrongshift(
+; CHECK-NEXT:    [[T0:%.*]] = ashr i8 [[Y:%.*]], 6
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = ashr i8 %y, 6
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_lshr_wrongshift(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_lshr_wrongshift(
+; CHECK-NEXT:    [[T0:%.*]] = lshr i8 [[Y:%.*]], 6
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = lshr i8 %y, 6
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+
+; *ext of i1 is always negatible
+define i8 @negate_sext(i8 %x, i1 %y) {
+; CHECK-LABEL: @negate_sext(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = zext i1 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = sext i1 %y to i8
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_zext(i8 %x, i1 %y) {
+; CHECK-LABEL: @negate_zext(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = sext i1 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = zext i1 %y to i8
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_sext_extrause(i8 %x, i1 %y) {
+; CHECK-LABEL: @negate_sext_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = sext i1 [[Y:%.*]] to i8
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = sext i1 %y to i8
+  call void @use8(i8 %t0)
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_zext_extrause(i8 %x, i1 %y) {
+; CHECK-LABEL: @negate_zext_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = zext i1 [[Y:%.*]] to i8
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = zext i1 %y to i8
+  call void @use8(i8 %t0)
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_sext_wrongwidth(i8 %x, i2 %y) {
+; CHECK-LABEL: @negate_sext_wrongwidth(
+; CHECK-NEXT:    [[T0:%.*]] = sext i2 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = sext i2 %y to i8
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+define i8 @negate_zext_wrongwidth(i8 %x, i2 %y) {
+; CHECK-LABEL: @negate_zext_wrongwidth(
+; CHECK-NEXT:    [[T0:%.*]] = zext i2 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = zext i2 %y to i8
+  %t1 = sub i8 %x, %t0
+  ret i8 %t1
+}
+
+define <2 x i4> @negate_shufflevector_oneinput_reverse(<2 x i4> %x, <2 x i4> %y) {
+; CHECK-LABEL: @negate_shufflevector_oneinput_reverse(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = shl <2 x i4> <i4 6, i4 -5>, [[X:%.*]]
+; CHECK-NEXT:    [[T1_NEG:%.*]] = shufflevector <2 x i4> [[T0_NEG]], <2 x i4> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = add <2 x i4> [[T1_NEG]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i4> [[T2]]
+;
+  %t0 = shl <2 x i4> <i4 -6, i4 5>, %x
+  %t1 = shufflevector <2 x i4> %t0, <2 x i4> poison, <2 x i32> <i32 1, i32 0>
+  %t2 = sub <2 x i4> %y, %t1
+  ret <2 x i4> %t2
+}
+define <2 x i4> @negate_shufflevector_oneinput_second_lane_is_undef(<2 x i4> %x, <2 x i4> %y) {
+; CHECK-LABEL: @negate_shufflevector_oneinput_second_lane_is_undef(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = shl <2 x i4> <i4 6, i4 -5>, [[X:%.*]]
+; CHECK-NEXT:    [[T1_NEG:%.*]] = shufflevector <2 x i4> [[T0_NEG]], <2 x i4> poison, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[T2:%.*]] = add <2 x i4> [[T1_NEG]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i4> [[T2]]
+;
+  %t0 = shl <2 x i4> <i4 -6, i4 5>, %x
+  %t1 = shufflevector <2 x i4> %t0, <2 x i4> poison, <2 x i32> <i32 0, i32 2>
+  %t2 = sub <2 x i4> %y, %t1
+  ret <2 x i4> %t2
+}
+define <2 x i4> @negate_shufflevector_twoinputs(<2 x i4> %x, <2 x i4> %y, <2 x i4> %z) {
+; CHECK-LABEL: @negate_shufflevector_twoinputs(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = shl <2 x i4> <i4 6, i4 -5>, [[X:%.*]]
+; CHECK-NEXT:    [[T1_NEG:%.*]] = add <2 x i4> [[Y:%.*]], <i4 poison, i4 1>
+; CHECK-NEXT:    [[T2_NEG:%.*]] = shufflevector <2 x i4> [[T0_NEG]], <2 x i4> [[T1_NEG]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = add <2 x i4> [[T2_NEG]], [[Z:%.*]]
+; CHECK-NEXT:    ret <2 x i4> [[T3]]
+;
+  %t0 = shl <2 x i4> <i4 -6, i4 5>, %x
+  %t1 = xor <2 x i4> %y, <i4 -1, i4 -1>
+  %t2 = shufflevector <2 x i4> %t0, <2 x i4> %t1, <2 x i32> <i32 0, i32 3>
+  %t3 = sub <2 x i4> %z, %t2
+  ret <2 x i4> %t3
+}
+define <2 x i4> @negate_shufflevector_oneinput_extrause(<2 x i4> %x, <2 x i4> %y) {
+; CHECK-LABEL: @negate_shufflevector_oneinput_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = shl <2 x i4> <i4 -6, i4 5>, [[X:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x i4> [[T0]], <2 x i4> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    call void @use_v2i4(<2 x i4> [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = sub <2 x i4> [[Y:%.*]], [[T1]]
+; CHECK-NEXT:    ret <2 x i4> [[T2]]
+;
+  %t0 = shl <2 x i4> <i4 -6, i4 5>, %x
+  %t1 = shufflevector <2 x i4> %t0, <2 x i4> poison, <2 x i32> <i32 1, i32 0>
+  call void @use_v2i4(<2 x i4> %t1)
+  %t2 = sub <2 x i4> %y, %t1
+  ret <2 x i4> %t2
+}
+
+; zext of non-negative can be negated
+; sext of non-positive can be negated
+define i16 @negation_of_zeroext_of_nonnegative(i8 %x) {
+; CHECK-LABEL: @negation_of_zeroext_of_nonnegative(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = icmp sgt i8 [[T0]], -1
+; CHECK-NEXT:    br i1 [[T1]], label [[NONNEG_BB:%.*]], label [[NEG_BB:%.*]]
+; CHECK:       nonneg_bb:
+; CHECK-NEXT:    [[T2:%.*]] = zext i8 [[T0]] to i16
+; CHECK-NEXT:    [[T3:%.*]] = sub nsw i16 0, [[T2]]
+; CHECK-NEXT:    ret i16 [[T3]]
+; CHECK:       neg_bb:
+; CHECK-NEXT:    ret i16 0
+;
+  %t0 = sub i8 0, %x
+  %t1 = icmp sge i8 %t0, 0
+  br i1 %t1, label %nonneg_bb, label %neg_bb
+
+nonneg_bb:
+  %t2 = zext i8 %t0 to i16
+  %t3 = sub i16 0, %t2
+  ret i16 %t3
+
+neg_bb:
+  ret i16 0
+}
+define i16 @negation_of_zeroext_of_positive(i8 %x) {
+; CHECK-LABEL: @negation_of_zeroext_of_positive(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = icmp sgt i8 [[T0]], 0
+; CHECK-NEXT:    br i1 [[T1]], label [[NONNEG_BB:%.*]], label [[NEG_BB:%.*]]
+; CHECK:       nonneg_bb:
+; CHECK-NEXT:    [[T2:%.*]] = zext i8 [[T0]] to i16
+; CHECK-NEXT:    [[T3:%.*]] = sub nsw i16 0, [[T2]]
+; CHECK-NEXT:    ret i16 [[T3]]
+; CHECK:       neg_bb:
+; CHECK-NEXT:    ret i16 0
+;
+  %t0 = sub i8 0, %x
+  %t1 = icmp sgt i8 %t0, 0
+  br i1 %t1, label %nonneg_bb, label %neg_bb
+
+nonneg_bb:
+  %t2 = zext i8 %t0 to i16
+  %t3 = sub i16 0, %t2
+  ret i16 %t3
+
+neg_bb:
+  ret i16 0
+}
+define i16 @negation_of_signext_of_negative(i8 %x) {
+; CHECK-LABEL: @negation_of_signext_of_negative(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = icmp slt i8 [[T0]], 0
+; CHECK-NEXT:    br i1 [[T1]], label [[NEG_BB:%.*]], label [[NONNEG_BB:%.*]]
+; CHECK:       neg_bb:
+; CHECK-NEXT:    [[T2:%.*]] = sext i8 [[T0]] to i16
+; CHECK-NEXT:    [[T3:%.*]] = sub nsw i16 0, [[T2]]
+; CHECK-NEXT:    ret i16 [[T3]]
+; CHECK:       nonneg_bb:
+; CHECK-NEXT:    ret i16 0
+;
+  %t0 = sub i8 0, %x
+  %t1 = icmp slt i8 %t0, 0
+  br i1 %t1, label %neg_bb, label %nonneg_bb
+
+neg_bb:
+  %t2 = sext i8 %t0 to i16
+  %t3 = sub i16 0, %t2
+  ret i16 %t3
+
+nonneg_bb:
+  ret i16 0
+}
+define i16 @negation_of_signext_of_nonpositive(i8 %x) {
+; CHECK-LABEL: @negation_of_signext_of_nonpositive(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = icmp slt i8 [[T0]], 1
+; CHECK-NEXT:    br i1 [[T1]], label [[NEG_BB:%.*]], label [[NONNEG_BB:%.*]]
+; CHECK:       neg_bb:
+; CHECK-NEXT:    [[T2:%.*]] = sext i8 [[T0]] to i16
+; CHECK-NEXT:    [[T3:%.*]] = sub nsw i16 0, [[T2]]
+; CHECK-NEXT:    ret i16 [[T3]]
+; CHECK:       nonneg_bb:
+; CHECK-NEXT:    ret i16 0
+;
+  %t0 = sub i8 0, %x
+  %t1 = icmp sle i8 %t0, 0
+  br i1 %t1, label %neg_bb, label %nonneg_bb
+
+neg_bb:
+  %t2 = sext i8 %t0 to i16
+  %t3 = sub i16 0, %t2
+  ret i16 %t3
+
+nonneg_bb:
+  ret i16 0
+}
+define i16 @negation_of_signext_of_nonnegative__wrong_cast(i8 %x) {
+; CHECK-LABEL: @negation_of_signext_of_nonnegative__wrong_cast(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = icmp sgt i8 [[T0]], -1
+; CHECK-NEXT:    br i1 [[T1]], label [[NONNEG_BB:%.*]], label [[NEG_BB:%.*]]
+; CHECK:       nonneg_bb:
+; CHECK-NEXT:    [[T2:%.*]] = sext i8 [[T0]] to i16
+; CHECK-NEXT:    [[T3:%.*]] = sub nsw i16 0, [[T2]]
+; CHECK-NEXT:    ret i16 [[T3]]
+; CHECK:       neg_bb:
+; CHECK-NEXT:    ret i16 0
+;
+  %t0 = sub i8 0, %x
+  %t1 = icmp sge i8 %t0, 0
+  br i1 %t1, label %nonneg_bb, label %neg_bb
+
+nonneg_bb:
+  %t2 = sext i8 %t0 to i16
+  %t3 = sub i16 0, %t2
+  ret i16 %t3
+
+neg_bb:
+  ret i16 0
+}
+define i16 @negation_of_zeroext_of_negative_wrongcast(i8 %x) {
+; CHECK-LABEL: @negation_of_zeroext_of_negative_wrongcast(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = icmp slt i8 [[T0]], 0
+; CHECK-NEXT:    br i1 [[T1]], label [[NEG_BB:%.*]], label [[NONNEG_BB:%.*]]
+; CHECK:       neg_bb:
+; CHECK-NEXT:    [[T2:%.*]] = zext i8 [[T0]] to i16
+; CHECK-NEXT:    [[T3:%.*]] = sub nsw i16 0, [[T2]]
+; CHECK-NEXT:    ret i16 [[T3]]
+; CHECK:       nonneg_bb:
+; CHECK-NEXT:    ret i16 0
+;
+  %t0 = sub i8 0, %x
+  %t1 = icmp slt i8 %t0, 0
+  br i1 %t1, label %neg_bb, label %nonneg_bb
+
+neg_bb:
+  %t2 = zext i8 %t0 to i16
+  %t3 = sub i16 0, %t2
+  ret i16 %t3
+
+nonneg_bb:
+  ret i16 0
+}
+
+; 'or' of 1 and an operand whose lowest bit is not set is 'inc'
+define i8 @negation_of_increment_via_or_with_no_common_bits_set(i8 %x, i8 %y) {
+; CHECK-LABEL: @negation_of_increment_via_or_with_no_common_bits_set(
+; CHECK-NEXT:    [[T0:%.*]] = shl i8 [[Y:%.*]], 1
+; CHECK-NEXT:    [[T1_NEG:%.*]] = xor i8 [[T0]], -1
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = shl i8 %y, 1
+  %t1 = or i8 %t0, 1
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @negation_of_increment_via_or_with_no_common_bits_set_extrause(i8 %x, i8 %y) {
+; CHECK-LABEL: @negation_of_increment_via_or_with_no_common_bits_set_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = shl i8 [[Y:%.*]], 1
+; CHECK-NEXT:    [[T1:%.*]] = or i8 [[T0]], 1
+; CHECK-NEXT:    call void @use8(i8 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = shl i8 %y, 1
+  %t1 = or i8 %t0, 1
+  call void @use8(i8 %t1)
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @negation_of_increment_via_or_common_bits_set(i8 %x, i8 %y) {
+; CHECK-LABEL: @negation_of_increment_via_or_common_bits_set(
+; CHECK-NEXT:    [[T0:%.*]] = shl i8 [[Y:%.*]], 1
+; CHECK-NEXT:    [[T1:%.*]] = or i8 [[T0]], 3
+; CHECK-NEXT:    [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = shl i8 %y, 1
+  %t1 = or i8 %t0, 3
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+
+; 'or' of operands with no common bits set is 'add'
+define i8 @add_via_or_with_no_common_bits_set(i8 %x, i8 %y) {
+; CHECK-LABEL: @add_via_or_with_no_common_bits_set(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1_NEG:%.*]] = shl i8 [[Y]], 2
+; CHECK-NEXT:    [[T2_NEG:%.*]] = add i8 [[T1_NEG]], -3
+; CHECK-NEXT:    [[T3:%.*]] = add i8 [[T2_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T3]]
+;
+  %t0 = sub i8 0, %y
+  call void @use8(i8 %t0)
+  %t1 = shl i8 %t0, 2
+  %t2 = or i8 %t1, 3
+  %t3 = sub i8 %x, %t2
+  ret i8 %t3
+}
+define i8 @add_via_or_with_common_bit_maybe_set(i8 %x, i8 %y) {
+; CHECK-LABEL: @add_via_or_with_common_bit_maybe_set(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = shl i8 [[T0]], 2
+; CHECK-NEXT:    [[T2:%.*]] = or i8 [[T1]], 4
+; CHECK-NEXT:    [[T3:%.*]] = sub i8 [[X:%.*]], [[T2]]
+; CHECK-NEXT:    ret i8 [[T3]]
+;
+  %t0 = sub i8 0, %y
+  call void @use8(i8 %t0)
+  %t1 = shl i8 %t0, 2
+  %t2 = or i8 %t1, 4
+  %t3 = sub i8 %x, %t2
+  ret i8 %t3
+}
+define i8 @add_via_or_with_no_common_bits_set_extrause(i8 %x, i8 %y) {
+; CHECK-LABEL: @add_via_or_with_no_common_bits_set_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = shl i8 [[T0]], 2
+; CHECK-NEXT:    [[T2:%.*]] = or i8 [[T1]], 3
+; CHECK-NEXT:    call void @use8(i8 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = sub i8 [[X:%.*]], [[T2]]
+; CHECK-NEXT:    ret i8 [[T3]]
+;
+  %t0 = sub i8 0, %y
+  call void @use8(i8 %t0)
+  %t1 = shl i8 %t0, 2
+  %t2 = or i8 %t1, 3
+  call void @use8(i8 %t2)
+  %t3 = sub i8 %x, %t2
+  ret i8 %t3
+}
+
+; `extractelement` is negatible if source operand is negatible.
+define i4 @negate_extractelement(<2 x i4> %x, i32 %y, i4 %z) {
+; CHECK-LABEL: @negate_extractelement(
+; CHECK-NEXT:    [[T0:%.*]] = sub <2 x i4> zeroinitializer, [[X:%.*]]
+; CHECK-NEXT:    call void @use_v2i4(<2 x i4> [[T0]])
+; CHECK-NEXT:    [[T1_NEG:%.*]] = extractelement <2 x i4> [[X]], i32 [[Y:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = add i4 [[T1_NEG]], [[Z:%.*]]
+; CHECK-NEXT:    ret i4 [[T2]]
+;
+  %t0 = sub <2 x i4> zeroinitializer, %x
+  call void @use_v2i4(<2 x i4> %t0)
+  %t1 = extractelement <2 x i4> %t0, i32 %y
+  %t2 = sub i4 %z, %t1
+  ret i4 %t2
+}
+define i4 @negate_extractelement_extrause(<2 x i4> %x, i32 %y, i4 %z) {
+; CHECK-LABEL: @negate_extractelement_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = sub <2 x i4> zeroinitializer, [[X:%.*]]
+; CHECK-NEXT:    call void @use_v2i4(<2 x i4> [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = extractelement <2 x i4> [[T0]], i32 [[Y:%.*]]
+; CHECK-NEXT:    call void @use4(i4 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = sub i4 [[Z:%.*]], [[T1]]
+; CHECK-NEXT:    ret i4 [[T2]]
+;
+  %t0 = sub <2 x i4> zeroinitializer, %x
+  call void @use_v2i4(<2 x i4> %t0)
+  %t1 = extractelement <2 x i4> %t0, i32 %y
+  call void @use4(i4 %t1)
+  %t2 = sub i4 %z, %t1
+  ret i4 %t2
+}
+
+; `insertelement` is negatible if both source vector and element-to-be-inserted are negatible.
+define <2 x i4> @negate_insertelement(<2 x i4> %src, i4 %a, i32 %x, <2 x i4> %b) {
+; CHECK-LABEL: @negate_insertelement(
+; CHECK-NEXT:    [[T2_NEG:%.*]] = insertelement <2 x i4> [[SRC:%.*]], i4 [[A:%.*]], i32 [[X:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = add <2 x i4> [[T2_NEG]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i4> [[T3]]
+;
+  %t0 = sub <2 x i4> zeroinitializer, %src
+  %t1 = sub i4 zeroinitializer, %a
+  %t2 = insertelement <2 x i4> %t0, i4 %t1, i32 %x
+  %t3 = sub <2 x i4> %b, %t2
+  ret <2 x i4> %t3
+}
+define <2 x i4> @negate_insertelement_extrause(<2 x i4> %src, i4 %a, i32 %x, <2 x i4> %b) {
+; CHECK-LABEL: @negate_insertelement_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = sub <2 x i4> zeroinitializer, [[SRC:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = sub i4 0, [[A:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x i4> [[T0]], i4 [[T1]], i32 [[X:%.*]]
+; CHECK-NEXT:    call void @use_v2i4(<2 x i4> [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = sub <2 x i4> [[B:%.*]], [[T2]]
+; CHECK-NEXT:    ret <2 x i4> [[T3]]
+;
+  %t0 = sub <2 x i4> zeroinitializer, %src
+  %t1 = sub i4 zeroinitializer, %a
+  %t2 = insertelement <2 x i4> %t0, i4 %t1, i32 %x
+  call void @use_v2i4(<2 x i4> %t2)
+  %t3 = sub <2 x i4> %b, %t2
+  ret <2 x i4> %t3
+}
+define <2 x i4> @negate_insertelement_nonnegatible_base(<2 x i4> %src, i4 %a, i32 %x, <2 x i4> %b) {
+; CHECK-LABEL: @negate_insertelement_nonnegatible_base(
+; CHECK-NEXT:    [[T1:%.*]] = sub i4 0, [[A:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x i4> [[SRC:%.*]], i4 [[T1]], i32 [[X:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = sub <2 x i4> [[B:%.*]], [[T2]]
+; CHECK-NEXT:    ret <2 x i4> [[T3]]
+;
+  %t1 = sub i4 zeroinitializer, %a
+  %t2 = insertelement <2 x i4> %src, i4 %t1, i32 %x
+  %t3 = sub <2 x i4> %b, %t2
+  ret <2 x i4> %t3
+}
+define <2 x i4> @negate_insertelement_nonnegatible_insert(<2 x i4> %src, i4 %a, i32 %x, <2 x i4> %b) {
+; CHECK-LABEL: @negate_insertelement_nonnegatible_insert(
+; CHECK-NEXT:    [[T0:%.*]] = sub <2 x i4> zeroinitializer, [[SRC:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x i4> [[T0]], i4 [[A:%.*]], i32 [[X:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = sub <2 x i4> [[B:%.*]], [[T2]]
+; CHECK-NEXT:    ret <2 x i4> [[T3]]
+;
+  %t0 = sub <2 x i4> zeroinitializer, %src
+  %t2 = insertelement <2 x i4> %t0, i4 %a, i32 %x
+  %t3 = sub <2 x i4> %b, %t2
+  ret <2 x i4> %t3
+}
+
+; left-shift by constant can always be negated
+define i8 @negate_left_shift_by_constant_prefer_keeping_shl(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @negate_left_shift_by_constant_prefer_keeping_shl(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1_NEG:%.*]] = shl i8 [[Z]], 4
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 0, %z
+  call void @use8(i8 %t0)
+  %t1 = shl i8 %t0, 4
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @negate_left_shift_by_constant_prefer_keeping_shl_extrause(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @negate_left_shift_by_constant_prefer_keeping_shl_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = shl i8 [[T0]], 4
+; CHECK-NEXT:    call void @use8(i8 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 0, %z
+  call void @use8(i8 %t0)
+  %t1 = shl i8 %t0, 4
+  call void @use8(i8 %t1)
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @negate_left_shift_by_constant(i8 %x, i8 %y, i8 %z, i8 %k) {
+; CHECK-LABEL: @negate_left_shift_by_constant(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 [[K:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1_NEG:%.*]] = mul i8 [[T0]], -16
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 %k, %z
+  call void @use8(i8 %t0)
+  %t1 = shl i8 %t0, 4
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+define i8 @negate_left_shift_by_constant_extrause(i8 %x, i8 %y, i8 %z, i8 %k) {
+; CHECK-LABEL: @negate_left_shift_by_constant_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 [[K:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = shl i8 [[T0]], 4
+; CHECK-NEXT:    call void @use8(i8 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 %k, %z
+  call void @use8(i8 %t0)
+  %t1 = shl i8 %t0, 4
+  call void @use8(i8 %t1)
+  %t2 = sub i8 %x, %t1
+  ret i8 %t2
+}
+
+; `add` with single negatible operand is still negatible
+define i8 @negate_add_with_single_negatible_operand(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_add_with_single_negatible_operand(
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 -42, [[X:%.*]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = add i8 %x, 42
+  %t1 = sub i8 0, %t0
+  ret i8 %t1
+}
+; do so even if we are two levels deep
+define i8 @negate_add_with_single_negatible_operand_depth2(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_add_with_single_negatible_operand_depth2(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = sub i8 -21, [[X:%.*]]
+; CHECK-NEXT:    [[T1_NEG:%.*]] = mul i8 [[T0_NEG]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[T1_NEG]]
+;
+  %t0 = add i8 %x, 21
+  %t1 = mul i8 %t0, %y
+  %t2 = sub i8 0, %t1
+  ret i8 %t2
+}
+
+define i8 @negate_add_with_single_negatible_operand_extrause(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_add_with_single_negatible_operand_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = add i8 [[X:%.*]], 42
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 -42, [[X]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = add i8 %x, 42
+  call void @use8(i8 %t0)
+  %t1 = sub i8 0, %t0
+  ret i8 %t1
+}
+; But don't do this if that means just sinking the negation.
+define i8 @negate_add_with_single_negatible_operand_non_negation(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_add_with_single_negatible_operand_non_negation(
+; CHECK-NEXT:    [[T0:%.*]] = add i8 [[X:%.*]], 42
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[Y:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = add i8 %x, 42
+  %t1 = sub i8 %y, %t0
+  ret i8 %t1
+}
+
+; abs/nabs can be negated
+define i8 @negate_abs(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_abs(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X]], i1 false)
+; CHECK-NEXT:    [[T3:%.*]] = sub i8 [[Y:%.*]], [[TMP1]]
+; CHECK-NEXT:    ret i8 [[T3]]
+;
+  %t0 = sub i8 0, %x
+  call void @use8(i8 %t0)
+  %t1 = icmp slt i8 %x, 0
+  %t2 = select i1 %t1, i8 %t0, i8 %x, !prof !0
+  %t3 = sub i8 %y, %t2
+  ret i8 %t3
+}
+define i8 @negate_nabs(i8 %x, i8 %y) {
+; CHECK-LABEL: @negate_nabs(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X]], i1 false)
+; CHECK-NEXT:    [[T3:%.*]] = add i8 [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[T3]]
+;
+  %t0 = sub i8 0, %x
+  call void @use8(i8 %t0)
+  %t1 = icmp slt i8 %x, 0
+  %t2 = select i1 %t1, i8 %x, i8 %t0, !prof !0
+  %t3 = sub i8 %y, %t2
+  ret i8 %t3
+}
+
+; And in general, if the arms of a select are known to be negations of each other,
+; we can negate the select
+define i8 @negate_select_of_op_vs_negated_op(i8 %x, i8 %y, i1 %c) {
+; CHECK-LABEL: @negate_select_of_op_vs_negated_op(
+; CHECK-NEXT:    [[T0:%.*]] = sub i8 0, [[X:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[T0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[C:%.*]], i8 [[X]], i8 [[T0]], !prof !0
+; CHECK-NEXT:    [[T2:%.*]] = add i8 [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %t0 = sub i8 0, %x
+  call void @use8(i8 %t0)
+  %t1 = select i1 %c, i8 %t0, i8 %x, !prof !0
+  %t2 = sub i8 %y, %t1
+  ret i8 %t2
+}
+define i8 @dont_negate_ordinary_select(i8 %x, i8 %y, i8 %z, i1 %c) {
+; CHECK-LABEL: @dont_negate_ordinary_select(
+; CHECK-NEXT:    [[T0:%.*]] = select i1 [[C:%.*]], i8 [[X:%.*]], i8 [[Y:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 [[Z:%.*]], [[T0]]
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %t0 = select i1 %c, i8 %x, i8 %y
+  %t1 = sub i8 %z, %t0
+  ret i8 %t1
+}
+
+; Freeze is transparent as far as negation is concerned
+define i4 @negate_freeze(i4 %x, i4 %y, i4 %z) {
+; CHECK-LABEL: @negate_freeze(
+; CHECK-NEXT:    [[T0_NEG:%.*]] = sub i4 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[T1_NEG:%.*]] = freeze i4 [[T0_NEG]]
+; CHECK-NEXT:    [[T2:%.*]] = add i4 [[T1_NEG]], [[Z:%.*]]
+; CHECK-NEXT:    ret i4 [[T2]]
+;
+  %t0 = sub i4 %x, %y
+  %t1 = freeze i4 %t0
+  %t2 = sub i4 %z, %t1
+  ret i4 %t2
+}
+define i4 @negate_freeze_extrause(i4 %x, i4 %y, i4 %z) {
+; CHECK-LABEL: @negate_freeze_extrause(
+; CHECK-NEXT:    [[T0:%.*]] = sub i4 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = freeze i4 [[T0]]
+; CHECK-NEXT:    call void @use4(i4 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = sub i4 [[Z:%.*]], [[T1]]
+; CHECK-NEXT:    ret i4 [[T2]]
+;
+  %t0 = sub i4 %x, %y
+  %t1 = freeze i4 %t0
+  call void @use4(i4 %t1)
+  %t2 = sub i4 %z, %t1
+  ret i4 %t2
+}
+
+; Due to InstCombine's worklist management, there is no guarantee that
+; each instruction we encounter has already been visited by InstCombine.
+; In particular, and most importantly for us, that means we have to canonicalize
+; constants to the RHS ourselves, since that is sometimes helpful.
+; This used to cause an endless combine loop.
+define void @noncanonical_mul_with_constant_as_first_operand() {
+; CHECK-LABEL: @noncanonical_mul_with_constant_as_first_operand(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    br label [[IF_END]]
+;
+entry:
+  br label %if.end
+
+if.end:
+  %e.0 = phi i32 [ undef, %entry ], [ %div, %if.end ]
+  %conv = trunc i32 %e.0 to i16
+  %mul.i = mul nsw i16 -1, %conv
+  %conv1 = sext i16 %mul.i to i32
+  %div = sub nsw i32 0, %conv1
+  br label %if.end
+}
+
+; CHECK: !0 = !{!"branch_weights", i32 40, i32 1}
+!0 = !{!"branch_weights", i32 40, i32 1}

diff  --git a/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll
index 580854a432bf..1127eacd35b9 100644
--- a/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll
@@ -176,7 +176,7 @@ define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) {
 define <4 x i64> @PR45314(<4 x i64> %x) {
 ; LE-LABEL: @PR45314(
 ; LE-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
-; LE-NEXT:    [[S:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; LE-NEXT:    [[S:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; LE-NEXT:    [[B:%.*]] = bitcast <8 x i32> [[S]] to <4 x i64>
 ; LE-NEXT:    ret <4 x i64> [[B]]
 ;
@@ -189,7 +189,7 @@ define <4 x i64> @PR45314(<4 x i64> %x) {
   %e = extractelement <4 x i64> %x, i32 0
   %t = trunc i64 %e to i32
   %i = insertelement <8 x i32> poison, i32 %t, i32 0
-  %s = shufflevector <8 x i32> %i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %s = shufflevector <8 x i32> %i, <8 x i32> poison, <8 x i32> zeroinitializer
   %b = bitcast <8 x i32> %s to <4 x i64>
   ret <4 x i64> %b
 }

diff  --git a/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll
new file mode 100644
index 000000000000..ed68aeafd4c5
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll
@@ -0,0 +1,1023 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; InstCombine should be able to eliminate all of these ext casts.
+
+declare void @use(i32)
+declare void @use_vec(<2 x i32>)
+
+define i64 @test1(i64 %a) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[B:%.*]] = trunc i64 [[A:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = and i64 [[A]], 15
+; CHECK-NEXT:    call void @use(i32 [[B]])
+; CHECK-NEXT:    ret i64 [[C]]
+;
+  %b = trunc i64 %a to i32
+  %c = and i32 %b, 15
+  %d = zext i32 %c to i64
+  call void @use(i32 %b)
+  ret i64 %d
+}
+
+define <2 x i64> @test1_vec(<2 x i64> %a) {
+; CHECK-LABEL: @test1_vec(
+; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[D:%.*]] = and <2 x i64> [[A]], <i64 15, i64 15>
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[B]])
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %b = trunc <2 x i64> %a to <2 x i32>
+  %c = and <2 x i32> %b, <i32 15, i32 15>
+  %d = zext <2 x i32> %c to <2 x i64>
+  call void @use_vec(<2 x i32> %b)
+  ret <2 x i64> %d
+}
+
+define <2 x i64> @test1_vec_nonuniform(<2 x i64> %a) {
+; CHECK-LABEL: @test1_vec_nonuniform(
+; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[D:%.*]] = and <2 x i64> [[A]], <i64 15, i64 7>
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[B]])
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %b = trunc <2 x i64> %a to <2 x i32>
+  %c = and <2 x i32> %b, <i32 15, i32 7>
+  %d = zext <2 x i32> %c to <2 x i64>
+  call void @use_vec(<2 x i32> %b)
+  ret <2 x i64> %d
+}
+
+define <2 x i64> @test1_vec_undef(<2 x i64> %a) {
+; CHECK-LABEL: @test1_vec_undef(
+; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[D:%.*]] = and <2 x i64> [[A]], <i64 15, i64 0>
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[B]])
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %b = trunc <2 x i64> %a to <2 x i32>
+  %c = and <2 x i32> %b, <i32 15, i32 undef>
+  %d = zext <2 x i32> %c to <2 x i64>
+  call void @use_vec(<2 x i32> %b)
+  ret <2 x i64> %d
+}
+
+define i64 @test2(i64 %a) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[B:%.*]] = trunc i64 [[A:%.*]] to i32
+; CHECK-NEXT:    [[D1:%.*]] = shl i64 [[A]], 36
+; CHECK-NEXT:    [[D:%.*]] = ashr exact i64 [[D1]], 36
+; CHECK-NEXT:    call void @use(i32 [[B]])
+; CHECK-NEXT:    ret i64 [[D]]
+;
+  %b = trunc i64 %a to i32
+  %c = shl i32 %b, 4
+  %q = ashr i32 %c, 4
+  %d = sext i32 %q to i64
+  call void @use(i32 %b)
+  ret i64 %d
+}
+
+define <2 x i64> @test2_vec(<2 x i64> %a) {
+; CHECK-LABEL: @test2_vec(
+; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[D1:%.*]] = shl <2 x i64> [[A]], <i64 36, i64 36>
+; CHECK-NEXT:    [[D:%.*]] = ashr exact <2 x i64> [[D1]], <i64 36, i64 36>
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[B]])
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %b = trunc <2 x i64> %a to <2 x i32>
+  %c = shl <2 x i32> %b, <i32 4, i32 4>
+  %q = ashr <2 x i32> %c, <i32 4, i32 4>
+  %d = sext <2 x i32> %q to <2 x i64>
+  call void @use_vec(<2 x i32> %b)
+  ret <2 x i64> %d
+}
+
+define <2 x i64> @test2_vec_nonuniform(<2 x i64> %a) {
+; CHECK-LABEL: @test2_vec_nonuniform(
+; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[D1:%.*]] = shl <2 x i64> [[A]], <i64 36, i64 37>
+; CHECK-NEXT:    [[D:%.*]] = ashr <2 x i64> [[D1]], <i64 36, i64 37>
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[B]])
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %b = trunc <2 x i64> %a to <2 x i32>
+  %c = shl <2 x i32> %b, <i32 4, i32 5>
+  %q = ashr <2 x i32> %c, <i32 4, i32 5>
+  %d = sext <2 x i32> %q to <2 x i64>
+  call void @use_vec(<2 x i32> %b)
+  ret <2 x i64> %d
+}
+
+define <2 x i64> @test2_vec_undef(<2 x i64> %a) {
+; CHECK-LABEL: @test2_vec_undef(
+; CHECK-NEXT:    [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[D1:%.*]] = shl <2 x i64> [[A]], <i64 36, i64 undef>
+; CHECK-NEXT:    [[D:%.*]] = ashr <2 x i64> [[D1]], <i64 36, i64 undef>
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[B]])
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %b = trunc <2 x i64> %a to <2 x i32>
+  %c = shl <2 x i32> %b, <i32 4, i32 undef>
+  %q = ashr <2 x i32> %c, <i32 4, i32 undef>
+  %d = sext <2 x i32> %q to <2 x i64>
+  call void @use_vec(<2 x i32> %b)
+  ret <2 x i64> %d
+}
+
+define i64 @test3(i64 %a) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[B:%.*]] = trunc i64 [[A:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = and i64 [[A]], 8
+; CHECK-NEXT:    call void @use(i32 [[B]])
+; CHECK-NEXT:    ret i64 [[C]]
+;
+  %b = trunc i64 %a to i32
+  %c = and i32 %b, 8
+  %d = zext i32 %c to i64
+  call void @use(i32 %b)
+  ret i64 %d
+}
+
+define i64 @test4(i64 %a) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[B:%.*]] = trunc i64 [[A:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = and i64 [[A]], 8
+; CHECK-NEXT:    [[X:%.*]] = xor i64 [[C]], 8
+; CHECK-NEXT:    call void @use(i32 [[B]])
+; CHECK-NEXT:    ret i64 [[X]]
+;
+  %b = trunc i64 %a to i32
+  %c = and i32 %b, 8
+  %x = xor i32 %c, 8
+  %d = zext i32 %x to i64
+  call void @use(i32 %b)
+  ret i64 %d
+}
+
+define i32 @test5(i32 %A) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[A:%.*]], 16
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %B = zext i32 %A to i128
+  %C = lshr i128 %B, 16
+  %D = trunc i128 %C to i32
+  ret i32 %D
+}
+
+define i32 @test6(i64 %A) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[A:%.*]], 32
+; CHECK-NEXT:    [[D:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[D]]
+;
+  %B = zext i64 %A to i128
+  %C = lshr i128 %B, 32
+  %D = trunc i128 %C to i32
+  ret i32 %D
+}
+
+; Test case where 'ashr' demanded bits does not contain any of the high bits,
+; but does contain sign bits, where the sign bit is not known to be zero.
+define i16 @ashr_mul_sign_bits(i8 %X, i8 %Y) {
+; CHECK-LABEL: @ashr_mul_sign_bits(
+; CHECK-NEXT:    [[A:%.*]] = sext i8 [[X:%.*]] to i16
+; CHECK-NEXT:    [[B:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = mul nsw i16 [[A]], [[B]]
+; CHECK-NEXT:    [[D:%.*]] = ashr i16 [[C]], 3
+; CHECK-NEXT:    ret i16 [[D]]
+;
+  %A = sext i8 %X to i32
+  %B = sext i8 %Y to i32
+  %C = mul i32 %A, %B
+  %D = ashr i32 %C, 3
+  %E = trunc i32 %D to i16
+  ret i16 %E
+}
+
+define i16 @ashr_mul(i8 %X, i8 %Y) {
+; CHECK-LABEL: @ashr_mul(
+; CHECK-NEXT:    [[A:%.*]] = sext i8 [[X:%.*]] to i16
+; CHECK-NEXT:    [[B:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = mul nsw i16 [[A]], [[B]]
+; CHECK-NEXT:    [[D:%.*]] = ashr i16 [[C]], 8
+; CHECK-NEXT:    ret i16 [[D]]
+;
+  %A = sext i8 %X to i20
+  %B = sext i8 %Y to i20
+  %C = mul i20 %A, %B
+  %D = ashr i20 %C, 8
+  %E = trunc i20 %D to i16
+  ret i16 %E
+}
+
+define i32 @trunc_ashr(i32 %X) {
+; CHECK-LABEL: @trunc_ashr(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 8
+; CHECK-NEXT:    [[C:%.*]] = or i32 [[TMP1]], -8388608
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = zext i32 %X to i36
+  %B = or i36 %A, -2147483648 ; 0xF80000000
+  %C = ashr i36 %B, 8
+  %T = trunc i36 %C to i32
+  ret i32  %T
+}
+
+define <2 x i32> @trunc_ashr_vec(<2 x i32> %X) {
+; CHECK-LABEL: @trunc_ashr_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], <i32 8, i32 8>
+; CHECK-NEXT:    [[C:%.*]] = or <2 x i32> [[TMP1]], <i32 -8388608, i32 -8388608>
+; CHECK-NEXT:    ret <2 x i32> [[C]]
+;
+  %A = zext <2 x i32> %X to <2 x i36>
+  %B = or <2 x i36> %A, <i36 -2147483648, i36 -2147483648> ; 0xF80000000
+  %C = ashr <2 x i36> %B, <i36 8, i36 8>
+  %T = trunc <2 x i36> %C to <2 x i32>
+  ret <2 x i32>  %T
+}
+
+define i92 @test7(i64 %A) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[A:%.*]], 32
+; CHECK-NEXT:    [[D:%.*]] = zext i64 [[TMP1]] to i92
+; CHECK-NEXT:    ret i92 [[D]]
+;
+  %B = zext i64 %A to i128
+  %C = lshr i128 %B, 32
+  %D = trunc i128 %C to i92
+  ret i92 %D
+}
+
+define i64 @test8(i32 %A, i32 %B) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[C:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[D:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[E:%.*]] = shl nuw i64 [[D]], 32
+; CHECK-NEXT:    [[F:%.*]] = or i64 [[E]], [[C]]
+; CHECK-NEXT:    ret i64 [[F]]
+;
+  %C = zext i32 %A to i128
+  %D = zext i32 %B to i128
+  %E = shl i128 %D, 32
+  %F = or i128 %E, %C
+  %G = trunc i128 %F to i64
+  ret i64 %G
+}
+
+define <2 x i64> @test8_vec(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test8_vec(
+; CHECK-NEXT:    [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[E:%.*]] = shl nuw <2 x i64> [[D]], <i64 32, i64 32>
+; CHECK-NEXT:    [[F:%.*]] = or <2 x i64> [[E]], [[C]]
+; CHECK-NEXT:    ret <2 x i64> [[F]]
+;
+  %C = zext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = shl <2 x i128> %D, <i128 32, i128 32>
+  %F = or <2 x i128> %E, %C
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define <2 x i64> @test8_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test8_vec_nonuniform(
+; CHECK-NEXT:    [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[E:%.*]] = shl <2 x i64> [[D]], <i64 32, i64 48>
+; CHECK-NEXT:    [[F:%.*]] = or <2 x i64> [[E]], [[C]]
+; CHECK-NEXT:    ret <2 x i64> [[F]]
+;
+  %C = zext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = shl <2 x i128> %D, <i128 32, i128 48>
+  %F = or <2 x i128> %E, %C
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define <2 x i64> @test8_vec_undef(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test8_vec_undef(
+; CHECK-NEXT:    [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[E:%.*]] = shl <2 x i128> [[D]], <i128 32, i128 undef>
+; CHECK-NEXT:    [[F:%.*]] = or <2 x i128> [[E]], [[C]]
+; CHECK-NEXT:    [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[G]]
+;
+  %C = zext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = shl <2 x i128> %D, <i128 32, i128 undef>
+  %F = or <2 x i128> %E, %C
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define i8 @test9(i32 %X) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8
+; CHECK-NEXT:    [[Z:%.*]] = and i8 [[TMP1]], 42
+; CHECK-NEXT:    ret i8 [[Z]]
+;
+  %Y = and i32 %X, 42
+  %Z = trunc i32 %Y to i8
+  ret i8 %Z
+}
+
+; rdar://8808586
+define i8 @test10(i32 %X) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[Y:%.*]] = trunc i32 [[X:%.*]] to i8
+; CHECK-NEXT:    [[Z:%.*]] = and i8 [[Y]], 42
+; CHECK-NEXT:    ret i8 [[Z]]
+;
+  %Y = trunc i32 %X to i8
+  %Z = and i8 %Y, 42
+  ret i8 %Z
+}
+
+define i64 @test11(i32 %A, i32 %B) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[C:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[B:%.*]], 31
+; CHECK-NEXT:    [[E:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[F:%.*]] = shl i64 [[C]], [[E]]
+; CHECK-NEXT:    ret i64 [[F]]
+;
+  %C = zext i32 %A to i128
+  %D = zext i32 %B to i128
+  %E = and i128 %D, 31
+  %F = shl i128 %C, %E
+  %G = trunc i128 %F to i64
+  ret i64 %G
+}
+
+define <2 x i64> @test11_vec(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test11_vec(
+; CHECK-NEXT:    [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], <i32 31, i32 31>
+; CHECK-NEXT:    [[E:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[F:%.*]] = shl <2 x i64> [[C]], [[E]]
+; CHECK-NEXT:    ret <2 x i64> [[F]]
+;
+  %C = zext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = and <2 x i128> %D, <i128 31, i128 31>
+  %F = shl <2 x i128> %C, %E
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define <2 x i64> @test11_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test11_vec_nonuniform(
+; CHECK-NEXT:    [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], <i32 31, i32 15>
+; CHECK-NEXT:    [[E:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[F:%.*]] = shl <2 x i64> [[C]], [[E]]
+; CHECK-NEXT:    ret <2 x i64> [[F]]
+;
+  %C = zext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = and <2 x i128> %D, <i128 31, i128 15>
+  %F = shl <2 x i128> %C, %E
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define <2 x i64> @test11_vec_undef(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test11_vec_undef(
+; CHECK-NEXT:    [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[E:%.*]] = and <2 x i128> [[D]], <i128 31, i128 undef>
+; CHECK-NEXT:    [[F:%.*]] = shl <2 x i128> [[C]], [[E]]
+; CHECK-NEXT:    [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[G]]
+;
+  %C = zext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = and <2 x i128> %D, <i128 31, i128 undef>
+  %F = shl <2 x i128> %C, %E
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define i64 @test12(i32 %A, i32 %B) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[C:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[B:%.*]], 31
+; CHECK-NEXT:    [[E:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[F:%.*]] = lshr i64 [[C]], [[E]]
+; CHECK-NEXT:    ret i64 [[F]]
+;
+  %C = zext i32 %A to i128
+  %D = zext i32 %B to i128
+  %E = and i128 %D, 31
+  %F = lshr i128 %C, %E
+  %G = trunc i128 %F to i64
+  ret i64 %G
+}
+
+define <2 x i64> @test12_vec(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test12_vec(
+; CHECK-NEXT:    [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], <i32 31, i32 31>
+; CHECK-NEXT:    [[E:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[F:%.*]] = lshr <2 x i64> [[C]], [[E]]
+; CHECK-NEXT:    ret <2 x i64> [[F]]
+;
+  %C = zext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = and <2 x i128> %D, <i128 31, i128 31>
+  %F = lshr <2 x i128> %C, %E
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define <2 x i64> @test12_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test12_vec_nonuniform(
+; CHECK-NEXT:    [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], <i32 31, i32 15>
+; CHECK-NEXT:    [[E:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[F:%.*]] = lshr <2 x i64> [[C]], [[E]]
+; CHECK-NEXT:    ret <2 x i64> [[F]]
+;
+  %C = zext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = and <2 x i128> %D, <i128 31, i128 15>
+  %F = lshr <2 x i128> %C, %E
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define <2 x i64> @test12_vec_undef(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test12_vec_undef(
+; CHECK-NEXT:    [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[E:%.*]] = and <2 x i128> [[D]], <i128 31, i128 undef>
+; CHECK-NEXT:    [[F:%.*]] = lshr <2 x i128> [[C]], [[E]]
+; CHECK-NEXT:    [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[G]]
+;
+  %C = zext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = and <2 x i128> %D, <i128 31, i128 undef>
+  %F = lshr <2 x i128> %C, %E
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define i64 @test13(i32 %A, i32 %B) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[C:%.*]] = sext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[B:%.*]], 31
+; CHECK-NEXT:    [[E:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[F:%.*]] = ashr i64 [[C]], [[E]]
+; CHECK-NEXT:    ret i64 [[F]]
+;
+  %C = sext i32 %A to i128
+  %D = zext i32 %B to i128
+  %E = and i128 %D, 31
+  %F = ashr i128 %C, %E
+  %G = trunc i128 %F to i64
+  ret i64 %G
+}
+
+define <2 x i64> @test13_vec(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test13_vec(
+; CHECK-NEXT:    [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], <i32 31, i32 31>
+; CHECK-NEXT:    [[E:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[F:%.*]] = ashr <2 x i64> [[C]], [[E]]
+; CHECK-NEXT:    ret <2 x i64> [[F]]
+;
+  %C = sext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = and <2 x i128> %D, <i128 31, i128 31>
+  %F = ashr <2 x i128> %C, %E
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define <2 x i64> @test13_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test13_vec_nonuniform(
+; CHECK-NEXT:    [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], <i32 31, i32 15>
+; CHECK-NEXT:    [[E:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[F:%.*]] = ashr <2 x i64> [[C]], [[E]]
+; CHECK-NEXT:    ret <2 x i64> [[F]]
+;
+  %C = sext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = and <2 x i128> %D, <i128 31, i128 15>
+  %F = ashr <2 x i128> %C, %E
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+define <2 x i64> @test13_vec_undef(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test13_vec_undef(
+; CHECK-NEXT:    [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128>
+; CHECK-NEXT:    [[E:%.*]] = and <2 x i128> [[D]], <i128 31, i128 undef>
+; CHECK-NEXT:    [[F:%.*]] = ashr <2 x i128> [[C]], [[E]]
+; CHECK-NEXT:    [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[G]]
+;
+  %C = sext <2 x i32> %A to <2 x i128>
+  %D = zext <2 x i32> %B to <2 x i128>
+  %E = and <2 x i128> %D, <i128 31, i128 undef>
+  %F = ashr <2 x i128> %C, %E
+  %G = trunc <2 x i128> %F to <2 x i64>
+  ret <2 x i64> %G
+}
+
+; PR25543
+; https://llvm.org/bugs/show_bug.cgi?id=25543
+; This is an extractelement.
+
+define i32 @trunc_bitcast1(<4 x i32> %v) {
+; CHECK-LABEL: @trunc_bitcast1(
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[V:%.*]], i32 1
+; CHECK-NEXT:    ret i32 [[EXT]]
+;
+  %bc = bitcast <4 x i32> %v to i128
+  %shr = lshr i128 %bc, 32
+  %ext = trunc i128 %shr to i32
+  ret i32 %ext
+}
+
+; A bitcast may still be required.
+
+define i32 @trunc_bitcast2(<2 x i64> %v) {
+; CHECK-LABEL: @trunc_bitcast2(
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[BC1]], i32 2
+; CHECK-NEXT:    ret i32 [[EXT]]
+;
+  %bc = bitcast <2 x i64> %v to i128
+  %shr = lshr i128 %bc, 64
+  %ext = trunc i128 %shr to i32
+  ret i32 %ext
+}
+
+; The right shift is optional.
+
+define i32 @trunc_bitcast3(<4 x i32> %v) {
+; CHECK-LABEL: @trunc_bitcast3(
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[V:%.*]], i32 0
+; CHECK-NEXT:    ret i32 [[EXT]]
+;
+  %bc = bitcast <4 x i32> %v to i128
+  %ext = trunc i128 %bc to i32
+  ret i32 %ext
+}
+
+define i32 @trunc_shl_31_i32_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_31_i32_i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 31
+; CHECK-NEXT:    ret i32 [[TRUNC]]
+;
+  %shl = shl i64 %val, 31
+  %trunc = trunc i64 %shl to i32
+  ret i32 %trunc
+}
+
+define i32 @trunc_shl_nsw_31_i32_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_nsw_31_i32_i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 31
+; CHECK-NEXT:    ret i32 [[TRUNC]]
+;
+  %shl = shl nsw i64 %val, 31
+  %trunc = trunc i64 %shl to i32
+  ret i32 %trunc
+}
+
+define i32 @trunc_shl_nuw_31_i32_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_nuw_31_i32_i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 31
+; CHECK-NEXT:    ret i32 [[TRUNC]]
+;
+  %shl = shl nuw i64 %val, 31
+  %trunc = trunc i64 %shl to i32
+  ret i32 %trunc
+}
+
+define i32 @trunc_shl_nsw_nuw_31_i32_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_nsw_nuw_31_i32_i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 31
+; CHECK-NEXT:    ret i32 [[TRUNC]]
+;
+  %shl = shl nsw nuw i64 %val, 31
+  %trunc = trunc i64 %shl to i32
+  ret i32 %trunc
+}
+
+define i16 @trunc_shl_15_i16_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_15_i16_i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i16
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl i16 [[VAL_TR]], 15
+; CHECK-NEXT:    ret i16 [[TRUNC]]
+;
+  %shl = shl i64 %val, 15
+  %trunc = trunc i64 %shl to i16
+  ret i16 %trunc
+}
+
+define i16 @trunc_shl_15_i16_i32(i32 %val) {
+; CHECK-LABEL: @trunc_shl_15_i16_i32(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc i32 [[VAL:%.*]] to i16
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl i16 [[VAL_TR]], 15
+; CHECK-NEXT:    ret i16 [[TRUNC]]
+;
+  %shl = shl i32 %val, 15
+  %trunc = trunc i32 %shl to i16
+  ret i16 %trunc
+}
+
+define i8 @trunc_shl_7_i8_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_7_i8_i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i8
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl i8 [[VAL_TR]], 7
+; CHECK-NEXT:    ret i8 [[TRUNC]]
+;
+  %shl = shl i64 %val, 7
+  %trunc = trunc i64 %shl to i8
+  ret i8 %trunc
+}
+
+define i2 @trunc_shl_1_i2_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_1_i2_i64(
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[VAL:%.*]], 1
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 [[SHL]] to i2
+; CHECK-NEXT:    ret i2 [[TRUNC]]
+;
+  %shl = shl i64 %val, 1
+  %trunc = trunc i64 %shl to i2
+  ret i2 %trunc
+}
+
+define i32 @trunc_shl_1_i32_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_1_i32_i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 1
+; CHECK-NEXT:    ret i32 [[TRUNC]]
+;
+  %shl = shl i64 %val, 1
+  %trunc = trunc i64 %shl to i32
+  ret i32 %trunc
+}
+
+define i32 @trunc_shl_16_i32_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_16_i32_i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 16
+; CHECK-NEXT:    ret i32 [[TRUNC]]
+;
+  %shl = shl i64 %val, 16
+  %trunc = trunc i64 %shl to i32
+  ret i32 %trunc
+}
+
+define i32 @trunc_shl_33_i32_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_33_i32_i64(
+; CHECK-NEXT:    ret i32 0
+;
+  %shl = shl i64 %val, 33
+  %trunc = trunc i64 %shl to i32
+  ret i32 %trunc
+}
+
+define i32 @trunc_shl_32_i32_i64(i64 %val) {
+; CHECK-LABEL: @trunc_shl_32_i32_i64(
+; CHECK-NEXT:    ret i32 0
+;
+  %shl = shl i64 %val, 32
+  %trunc = trunc i64 %shl to i32
+  ret i32 %trunc
+}
+
+; Should be able to handle vectors
+define <2 x i32> @trunc_shl_16_v2i32_v2i64(<2 x i64> %val) {
+; CHECK-LABEL: @trunc_shl_16_v2i32_v2i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc <2 x i64> [[VAL:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], <i32 16, i32 16>
+; CHECK-NEXT:    ret <2 x i32> [[TRUNC]]
+;
+  %shl = shl <2 x i64> %val, <i64 16, i64 16>
+  %trunc = trunc <2 x i64> %shl to <2 x i32>
+  ret <2 x i32> %trunc
+}
+
+define <2 x i32> @trunc_shl_nosplat_v2i32_v2i64(<2 x i64> %val) {
+; CHECK-LABEL: @trunc_shl_nosplat_v2i32_v2i64(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc <2 x i64> [[VAL:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], <i32 15, i32 16>
+; CHECK-NEXT:    ret <2 x i32> [[TRUNC]]
+;
+  %shl = shl <2 x i64> %val, <i64 15, i64 16>
+  %trunc = trunc <2 x i64> %shl to <2 x i32>
+  ret <2 x i32> %trunc
+}
+
+define void @trunc_shl_31_i32_i64_multi_use(i64 %val, i32 addrspace(1)* %ptr0, i64 addrspace(1)* %ptr1) {
+; CHECK-LABEL: @trunc_shl_31_i32_i64_multi_use(
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[VAL:%.*]], 31
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 [[SHL]] to i32
+; CHECK-NEXT:    store volatile i32 [[TRUNC]], i32 addrspace(1)* [[PTR0:%.*]], align 4
+; CHECK-NEXT:    store volatile i64 [[SHL]], i64 addrspace(1)* [[PTR1:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %shl = shl i64 %val, 31
+  %trunc = trunc i64 %shl to i32
+  store volatile i32 %trunc, i32 addrspace(1)* %ptr0
+  store volatile i64 %shl, i64 addrspace(1)* %ptr1
+  ret void
+}
+
+define i32 @trunc_shl_lshr_infloop(i64 %arg) {
+; CHECK-LABEL: @trunc_shl_lshr_infloop(
+; CHECK-NEXT:    [[ARG_TR:%.*]] = trunc i64 [[ARG:%.*]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[ARG_TR]], 1
+; CHECK-NEXT:    [[C:%.*]] = and i32 [[TMP1]], -4
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = lshr i64 %arg, 1
+  %B = shl i64 %A, 2
+  %C = trunc i64 %B to i32
+  ret i32 %C
+}
+
+define <2 x i32> @trunc_shl_v2i32_v2i64_uniform(<2 x i64> %val) {
+; CHECK-LABEL: @trunc_shl_v2i32_v2i64_uniform(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc <2 x i64> [[VAL:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], <i32 31, i32 31>
+; CHECK-NEXT:    ret <2 x i32> [[TRUNC]]
+;
+  %shl = shl <2 x i64> %val, <i64 31, i64 31>
+  %trunc = trunc <2 x i64> %shl to <2 x i32>
+  ret <2 x i32> %trunc
+}
+
+define <2 x i32> @trunc_shl_v2i32_v2i64_undef(<2 x i64> %val) {
+; CHECK-LABEL: @trunc_shl_v2i32_v2i64_undef(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc <2 x i64> [[VAL:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], <i32 31, i32 undef>
+; CHECK-NEXT:    ret <2 x i32> [[TRUNC]]
+;
+  %shl = shl <2 x i64> %val, <i64 31, i64 undef>
+  %trunc = trunc <2 x i64> %shl to <2 x i32>
+  ret <2 x i32> %trunc
+}
+
+define <2 x i32> @trunc_shl_v2i32_v2i64_nonuniform(<2 x i64> %val) {
+; CHECK-LABEL: @trunc_shl_v2i32_v2i64_nonuniform(
+; CHECK-NEXT:    [[VAL_TR:%.*]] = trunc <2 x i64> [[VAL:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], <i32 31, i32 12>
+; CHECK-NEXT:    ret <2 x i32> [[TRUNC]]
+;
+  %shl = shl <2 x i64> %val, <i64 31, i64 12>
+  %trunc = trunc <2 x i64> %shl to <2 x i32>
+  ret <2 x i32> %trunc
+}
+
+define <2 x i32> @trunc_shl_v2i32_v2i64_outofrange(<2 x i64> %val) {
+; CHECK-LABEL: @trunc_shl_v2i32_v2i64_outofrange(
+; CHECK-NEXT:    [[SHL:%.*]] = shl <2 x i64> [[VAL:%.*]], <i64 31, i64 33>
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i64> [[SHL]] to <2 x i32>
+; CHECK-NEXT:    ret <2 x i32> [[TRUNC]]
+;
+  %shl = shl <2 x i64> %val, <i64 31, i64 33>
+  %trunc = trunc <2 x i64> %shl to <2 x i32>
+  ret <2 x i32> %trunc
+}
+
+define i32 @trunc_shl_ashr_infloop(i64 %arg) {
+; CHECK-LABEL: @trunc_shl_ashr_infloop(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[ARG:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[C:%.*]] = and i32 [[TMP2]], -4
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = ashr i64 %arg, 3
+  %B = shl i64 %A, 2
+  %C = trunc i64 %B to i32
+  ret i32 %C
+}
+
+define i32 @trunc_shl_shl_infloop(i64 %arg) {
+; CHECK-LABEL: @trunc_shl_shl_infloop(
+; CHECK-NEXT:    [[ARG_TR:%.*]] = trunc i64 [[ARG:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = shl i32 [[ARG_TR]], 3
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = shl i64 %arg, 1
+  %B = shl i64 %A, 2
+  %C = trunc i64 %B to i32
+  ret i32 %C
+}
+
+define i32 @trunc_shl_lshr_var(i64 %arg, i64 %val) {
+; CHECK-LABEL: @trunc_shl_lshr_var(
+; CHECK-NEXT:    [[A:%.*]] = lshr i64 [[ARG:%.*]], [[VAL:%.*]]
+; CHECK-NEXT:    [[A_TR:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    [[C:%.*]] = shl i32 [[A_TR]], 2
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = lshr i64 %arg, %val
+  %B = shl i64 %A, 2
+  %C = trunc i64 %B to i32
+  ret i32 %C
+}
+
+define i32 @trunc_shl_ashr_var(i64 %arg, i64 %val) {
+; CHECK-LABEL: @trunc_shl_ashr_var(
+; CHECK-NEXT:    [[A:%.*]] = ashr i64 [[ARG:%.*]], [[VAL:%.*]]
+; CHECK-NEXT:    [[A_TR:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    [[C:%.*]] = shl i32 [[A_TR]], 2
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = ashr i64 %arg, %val
+  %B = shl i64 %A, 2
+  %C = trunc i64 %B to i32
+  ret i32 %C
+}
+
+define i32 @trunc_shl_shl_var(i64 %arg, i64 %val) {
+; CHECK-LABEL: @trunc_shl_shl_var(
+; CHECK-NEXT:    [[A:%.*]] = shl i64 [[ARG:%.*]], [[VAL:%.*]]
+; CHECK-NEXT:    [[A_TR:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    [[C:%.*]] = shl i32 [[A_TR]], 2
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = shl i64 %arg, %val
+  %B = shl i64 %A, 2
+  %C = trunc i64 %B to i32
+  ret i32 %C
+}
+
+define <8 x i16> @trunc_shl_v8i15_v8i32_15(<8 x i32> %a) {
+; CHECK-LABEL: @trunc_shl_v8i15_v8i32_15(
+; CHECK-NEXT:    [[A_TR:%.*]] = trunc <8 x i32> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[CONV:%.*]] = shl <8 x i16> [[A_TR]], <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+; CHECK-NEXT:    ret <8 x i16> [[CONV]]
+;
+  %shl = shl <8 x i32> %a, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  %conv = trunc <8 x i32> %shl to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <8 x i16> @trunc_shl_v8i16_v8i32_16(<8 x i32> %a) {
+; CHECK-LABEL: @trunc_shl_v8i16_v8i32_16(
+; CHECK-NEXT:    ret <8 x i16> zeroinitializer
+;
+  %shl = shl <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %conv = trunc <8 x i32> %shl to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <8 x i16> @trunc_shl_v8i16_v8i32_17(<8 x i32> %a) {
+; CHECK-LABEL: @trunc_shl_v8i16_v8i32_17(
+; CHECK-NEXT:    ret <8 x i16> zeroinitializer
+;
+  %shl = shl <8 x i32> %a, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+  %conv = trunc <8 x i32> %shl to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <8 x i16> @trunc_shl_v8i16_v8i32_4(<8 x i32> %a) {
+; CHECK-LABEL: @trunc_shl_v8i16_v8i32_4(
+; CHECK-NEXT:    [[A_TR:%.*]] = trunc <8 x i32> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[CONV:%.*]] = shl <8 x i16> [[A_TR]], <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+; CHECK-NEXT:    ret <8 x i16> [[CONV]]
+;
+  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+  %conv = trunc <8 x i32> %shl to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+; Although the mask is the same value, we don't create a shuffle for types that the backend may not be able to handle:
+; trunc (shuffle X, C, Mask) --> shuffle (trunc X), C', Mask
+
+define <4 x i8> @wide_shuf(<4 x i32> %x) {
+; CHECK-LABEL: @wide_shuf(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 3634, i32 90, i32 poison>, <4 x i32> <i32 1, i32 5, i32 6, i32 2>
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <4 x i32> [[SHUF]] to <4 x i8>
+; CHECK-NEXT:    ret <4 x i8> [[TRUNC]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> <i32 35, i32 3634, i32 90, i32 -1>, <4 x i32> <i32 1, i32 5, i32 6, i32 2>
+  %trunc = trunc <4 x i32> %shuf to <4 x i8>
+  ret <4 x i8> %trunc
+}
+
+; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask
+
+define <4 x i8> @wide_splat1(<4 x i32> %x) {
+; CHECK-LABEL: @wide_splat1(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i32> [[X:%.*]] to <4 x i8>
+; CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i8> [[TRUNC]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %trunc = trunc <4 x i32> %shuf to <4 x i8>
+  ret <4 x i8> %trunc
+}
+
+; Test weird types.
+; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask
+
+define <3 x i31> @wide_splat2(<3 x i33> %x) {
+; CHECK-LABEL: @wide_splat2(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <3 x i33> [[X:%.*]] to <3 x i31>
+; CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <3 x i31> [[TMP1]], <3 x i31> undef, <3 x i32> <i32 1, i32 1, i32 1>
+; CHECK-NEXT:    ret <3 x i31> [[TRUNC]]
+;
+  %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> <i32 1, i32 1, i32 1>
+  %trunc = trunc <3 x i33> %shuf to <3 x i31>
+  ret <3 x i31> %trunc
+}
+
+; FIXME:
+; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask
+; A mask with undef elements should still be considered a splat mask.
+
+define <3 x i31> @wide_splat3(<3 x i33> %x) {
+; CHECK-LABEL: @wide_splat3(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x i33> [[X:%.*]], <3 x i33> poison, <3 x i32> <i32 undef, i32 1, i32 1>
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <3 x i33> [[SHUF]] to <3 x i31>
+; CHECK-NEXT:    ret <3 x i31> [[TRUNC]]
+;
+  %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> <i32 undef, i32 1, i32 1>
+  %trunc = trunc <3 x i33> %shuf to <3 x i31>
+  ret <3 x i31> %trunc
+}
+
+; TODO: The shuffle extends the length of the input vector. Should we shrink this?
+
+define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) {
+; CHECK-LABEL: @wide_lengthening_splat(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TR:%.*]] = trunc <8 x i16> [[SHUF]] to <8 x i8>
+; CHECK-NEXT:    ret <8 x i8> [[TR]]
+;
+  %shuf = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+  %tr = trunc <8 x i16> %shuf to <8 x i8>
+  ret <8 x i8> %tr
+}
+
+define <2 x i8> @narrow_add_vec_constant(<2 x i32> %x) {
+; CHECK-LABEL: @narrow_add_vec_constant(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8>
+; CHECK-NEXT:    [[TR:%.*]] = add <2 x i8> [[TMP1]], <i8 0, i8 127>
+; CHECK-NEXT:    ret <2 x i8> [[TR]]
+;
+  %add = add <2 x i32> %x, <i32 256, i32 -129>
+  %tr = trunc <2 x i32> %add to <2 x i8>
+  ret <2 x i8> %tr
+}
+
+define <2 x i8> @narrow_mul_vec_constant(<2 x i32> %x) {
+; CHECK-LABEL: @narrow_mul_vec_constant(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8>
+; CHECK-NEXT:    [[TR:%.*]] = mul <2 x i8> [[TMP1]], <i8 0, i8 127>
+; CHECK-NEXT:    ret <2 x i8> [[TR]]
+;
+  %add = mul <2 x i32> %x, <i32 256, i32 -129>
+  %tr = trunc <2 x i32> %add to <2 x i8>
+  ret <2 x i8> %tr
+}
+
+define <2 x i8> @narrow_sub_vec_constant(<2 x i32> %x) {
+; CHECK-LABEL: @narrow_sub_vec_constant(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8>
+; CHECK-NEXT:    [[TR:%.*]] = sub <2 x i8> <i8 0, i8 127>, [[TMP1]]
+; CHECK-NEXT:    ret <2 x i8> [[TR]]
+;
+  %sub = sub <2 x i32> <i32 256, i32 -129>, %x
+  %tr = trunc <2 x i32> %sub to <2 x i8>
+  ret <2 x i8> %tr
+}
+
+; If the select is narrowed based on the target's datalayout, we allow more optimizations.
+
+define i16 @PR44545(i32 %t0, i32 %data) {
+; CHECK-LABEL: @PR44545(
+; CHECK-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[DATA:%.*]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[T0:%.*]] to i16
+; CHECK-NEXT:    [[SUB:%.*]] = select i1 [[ISZERO]], i16 -1, i16 [[TMP1]]
+; CHECK-NEXT:    ret i16 [[SUB]]
+;
+  %t1 = add nuw nsw i32 %t0, 1
+  %iszero = icmp eq i32 %data, 0
+  %ffs = select i1 %iszero, i32 0, i32 %t1
+  %cast = trunc i32 %ffs to i16
+  %sub = add nsw i16 %cast, -1
+  ret i16 %sub
+}

diff --git a/llvm/test/Transforms/InstCombine/type_pun-inseltpoison.ll b/llvm/test/Transforms/InstCombine/type_pun-inseltpoison.ll
new file mode 100644
index 000000000000..a0604c9c836b
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/type_pun-inseltpoison.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Ensure that type punning using a union of vector and same-sized array
+; generates an extract instead of a shuffle with an uncommon vector size:
+;
+;   typedef uint32_t v4i32 __attribute__((vector_size(16)));
+;   union { v4i32 v; uint32_t a[4]; };
+;
+; This cleans up behind SROA, which inserts the uncommon vector size when
+; cleaning up the alloca/store/GEP/load.
+
+
+; Provide legal integer types.
+target datalayout = "p:32:32"
+
+
+; Extracting the zeroth element in an i32 array.
+define i32 @type_pun_zeroth(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_zeroth(
+; CHECK-NEXT:    [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0
+; CHECK-NEXT:    ret i32 [[SROA_EXTRACT]]
+;
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x i8> %sroa to i32
+  ret i32 %1
+}
+
+; Extracting the first element in an i32 array.
+define i32 @type_pun_first(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_first(
+; CHECK-NEXT:    [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 1
+; CHECK-NEXT:    ret i32 [[SROA_EXTRACT]]
+;
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <4 x i8> %sroa to i32
+  ret i32 %1
+}
+
+; Extracting an i32 that isn't aligned to any natural boundary.
+define i32 @type_pun_misaligned(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_misaligned(
+; CHECK-NEXT:    [[SROA_EXTRACT:%.*]] = shufflevector <16 x i8> [[IN:%.*]], <16 x i8> undef, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SROA_BC:%.*]] = bitcast <16 x i8> [[SROA_EXTRACT]] to <4 x i32>
+; CHECK-NEXT:    [[SROA_EXTRACT1:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0
+; CHECK-NEXT:    ret i32 [[SROA_EXTRACT1]]
+;
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> poison, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+  %1 = bitcast <4 x i8> %sroa to i32
+  ret i32 %1
+}
+
+; Type punning to an array of pointers.
+define i32* @type_pun_pointer(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_pointer(
+; CHECK-NEXT:    [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = inttoptr i32 [[SROA_EXTRACT]] to i32*
+; CHECK-NEXT:    ret i32* [[TMP1]]
+;
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x i8> %sroa to i32
+  %2 = inttoptr i32 %1 to i32*
+  ret i32* %2
+}
+
+; Type punning to an array of 32-bit floating-point values.
+define float @type_pun_float(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_float(
+; CHECK-NEXT:    [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x float>
+; CHECK-NEXT:    [[SROA_EXTRACT:%.*]] = extractelement <4 x float> [[SROA_BC]], i32 0
+; CHECK-NEXT:    ret float [[SROA_EXTRACT]]
+;
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x i8> %sroa to float
+  ret float %1
+}
+
+; Type punning to an array of 64-bit floating-point values.
+define double @type_pun_double(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_double(
+; CHECK-NEXT:    [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <2 x double>
+; CHECK-NEXT:    [[SROA_EXTRACT:%.*]] = extractelement <2 x double> [[SROA_BC]], i32 0
+; CHECK-NEXT:    ret double [[SROA_EXTRACT]]
+;
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <8 x i8> %sroa to double
+  ret double %1
+}
+
+; Type punning to same-size floating-point and integer values.
+; Verify that multiple uses with different bitcast types are properly handled.
+define { float, i32 } @type_pun_float_i32(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_float_i32(
+; CHECK-NEXT:    [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0
+; CHECK-NEXT:    [[SROA_BC1:%.*]] = bitcast <16 x i8> [[IN]] to <4 x float>
+; CHECK-NEXT:    [[SROA_EXTRACT2:%.*]] = extractelement <4 x float> [[SROA_BC1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { float, i32 } undef, float [[SROA_EXTRACT2]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { float, i32 } [[TMP1]], i32 [[SROA_EXTRACT]], 1
+; CHECK-NEXT:    ret { float, i32 } [[TMP2]]
+;
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %f = bitcast <4 x i8> %sroa to float
+  %i = bitcast <4 x i8> %sroa to i32
+  %1 = insertvalue { float, i32 } undef, float %f, 0
+  %2 = insertvalue { float, i32 } %1, i32 %i, 1
+  ret { float, i32 } %2
+}
+
+; Type punning two i32 values, with control flow.
+; Verify that the bitcast is shared and dominates usage.
+define i32 @type_pun_i32_ctrl(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_i32_ctrl(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32>
+; CHECK-NEXT:    br i1 undef, label [[LEFT:%.*]], label [[RIGHT:%.*]]
+; CHECK:       left:
+; CHECK-NEXT:    [[SROA_EXTRACT1:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0
+; CHECK-NEXT:    br label [[TAIL:%.*]]
+; CHECK:       right:
+; CHECK-NEXT:    [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0
+; CHECK-NEXT:    br label [[TAIL]]
+; CHECK:       tail:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[SROA_EXTRACT1]], [[LEFT]] ], [ [[SROA_EXTRACT]], [[RIGHT]] ]
+; CHECK-NEXT:    ret i32 [[I]]
+;
+entry:
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  br i1 undef, label %left, label %right
+left:
+  %lhs = bitcast <4 x i8> %sroa to i32
+  br label %tail
+right:
+  %rhs = bitcast <4 x i8> %sroa to i32
+  br label %tail
+tail:
+  %i = phi i32 [ %lhs, %left ], [ %rhs, %right ]
+  ret i32 %i
+}
+
+; Extracting a type that won't fit in a vector isn't handled. The function
+; should stay the same.
+define i40 @type_pun_unhandled(<16 x i8> %in) {
+; CHECK-LABEL: @type_pun_unhandled(
+; CHECK-NEXT:    [[SROA:%.*]] = shufflevector <16 x i8> [[IN:%.*]], <16 x i8> poison, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <5 x i8> [[SROA]] to i40
+; CHECK-NEXT:    ret i40 [[TMP1]]
+;
+  %sroa = shufflevector <16 x i8> %in, <16 x i8> poison, <5 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8>
+  %1 = bitcast <5 x i8> %sroa to i40
+  ret i40 %1
+}

diff --git a/llvm/test/Transforms/InstCombine/vec-binop-select-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec-binop-select-inseltpoison.ll
new file mode 100644
index 000000000000..b862c90559d0
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/vec-binop-select-inseltpoison.ll
@@ -0,0 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Non-canonical mask
+
+define <4 x i32> @and(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @and(
+; CHECK-NEXT:    [[R:%.*]] = and <4 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+  %sel2 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
+  %r = and <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <vscale x 4 x i32> @vscaleand(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; CHECK-LABEL: @vscaleand(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <vscale x 4 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %sel1 = shufflevector <vscale x 4 x i32> %x, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %sel2 = shufflevector <vscale x 4 x i32> %y, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %r = and <vscale x 4 x i32> %sel1, %sel2
+  ret <vscale x 4 x i32> %r
+}
+
+define <4 x i32> @or(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @or(
+; CHECK-NEXT:    [[R:%.*]] = or <4 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+  %r = or <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+; Non-canonical masks
+
+define <4 x i32> @xor(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @xor(
+; CHECK-NEXT:    [[R:%.*]] = xor <4 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %r = xor <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+; Flags
+
+define <4 x i32> @add(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @add(
+; CHECK-NEXT:    [[R:%.*]] = add nsw <4 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %r = add nsw <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+; Negative test - wrong operand
+
+define <4 x i32> @add_wrong_op(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
+; CHECK-LABEL: @add_wrong_op(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = add nsw <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %z, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %r = add nsw <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+; Negative test - wrong mask (but we could handle this...)
+
+define <4 x i32> @add_non_select_mask(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @add_non_select_mask(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 1, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 1, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = add nsw <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 5, i32 2, i32 7>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 1, i32 5, i32 2, i32 7>
+  %r = add nsw <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+; Negative test - wrong mask (but we could handle this...)
+
+define <4 x i32> @add_masks_with_undefs(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @add_masks_with_undefs(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = add nsw <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 undef, i32 5, i32 2, i32 7>
+  %r = add nsw <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+; Non-commutative opcode
+
+define <4 x i32> @sub(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @sub(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = sub <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  %r = sub <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @mul(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @mul(
+; CHECK-NEXT:    [[R:%.*]] = mul nuw <4 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  %r = mul nuw <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @sdiv(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @sdiv(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = sdiv <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  %r = sdiv <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @udiv(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @udiv(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = udiv <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  %r = udiv <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @srem(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @srem(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = srem <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  %r = srem <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @urem(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @urem(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = urem <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  %r = urem <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @shl(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @shl(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = shl nsw <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  %r = shl nsw <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @lshr(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @lshr(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = lshr exact <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+  %r = lshr exact <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @ashr(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @ashr(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Y]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = ashr <4 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %sel1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %r = ashr <4 x i32> %sel1, %sel2
+  ret <4 x i32> %r
+}
+
+define <4 x float> @fadd(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @fadd(
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %sel1 = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %r = fadd <4 x float> %sel1, %sel2
+  ret <4 x float> %r
+}
+
+define <4 x float> @fsub(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @fsub(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[X:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fsub fast <4 x float> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %sel1 = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %r = fsub fast <4 x float> %sel1, %sel2
+  ret <4 x float> %r
+}
+
+define <4 x double> @fmul(<4 x double> %x, <4 x double> %y) {
+; CHECK-LABEL: @fmul(
+; CHECK-NEXT:    [[R:%.*]] = fmul nnan <4 x double> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
+  %sel1 = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x double> %y, <4 x double> %x, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %r = fmul nnan <4 x double> %sel1, %sel2
+  ret <4 x double> %r
+}
+
+define <4 x double> @fdiv(<4 x double> %x, <4 x double> %y) {
+; CHECK-LABEL: @fdiv(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[X:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fdiv nnan arcp <4 x double> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
+  %sel1 = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x double> %y, <4 x double> %x, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %r = fdiv arcp nnan <4 x double> %sel1, %sel2
+  ret <4 x double> %r
+}
+
+define <4 x double> @frem(<4 x double> %x, <4 x double> %y) {
+; CHECK-LABEL: @frem(
+; CHECK-NEXT:    [[SEL1:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[X:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[SEL2:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = frem <4 x double> [[SEL1]], [[SEL2]]
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
+  %sel1 = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %sel2 = shufflevector <4 x double> %y, <4 x double> %x, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %r = frem <4 x double> %sel1, %sel2
+  ret <4 x double> %r
+}

diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll
index 080324ead116..8bcdf0137ec9 100644
--- a/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll
@@ -84,7 +84,7 @@ define <2 x float> @test_fptrunc(double %f) {
   %t11 = insertelement <4 x double> %t10, double 0.000000e+00, i32 2
   %t12 = insertelement <4 x double> %t11, double 0.000000e+00, i32 3
   %t5 = fptrunc <4 x double> %t12 to <4 x float>
-  %ret = shufflevector <4 x float> %t5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %ret = shufflevector <4 x float> %t5, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %ret
 }
 
@@ -99,7 +99,7 @@ define <2 x double> @test_fpext(float %f) {
   %t11 = insertelement <4 x float> %t10, float 0.000000e+00, i32 2
   %t12 = insertelement <4 x float> %t11, float 0.000000e+00, i32 3
   %t5 = fpext <4 x float> %t12 to <4 x double>
-  %ret = shufflevector <4 x double> %t5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  %ret = shufflevector <4 x double> %t5, <4 x double> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %ret
 }
 
@@ -150,7 +150,7 @@ define <4 x float> @inselt_shuf_no_demand(float %a1, float %a2, float %a3) {
   %out1 = insertelement <4 x float> poison, float %a1, i32 1
   %out12 = insertelement <4 x float> %out1, float %a2, i32 2
   %out123 = insertelement <4 x float> %out12, float %a3, i32 3
-  %shuffle = shufflevector <4 x float> %out123, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %out123, <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   ret <4 x float> %shuffle
 }
 
@@ -175,7 +175,7 @@ define <4 x i32> @inselt_shuf_no_demand_multiuse(i32 %a0, i32 %a1, <4 x i32> %b)
 ; CHECK-NEXT:    [[OUT0:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
 ; CHECK-NEXT:    [[OUT01:%.*]] = insertelement <4 x i32> [[OUT0]], i32 [[A1:%.*]], i32 1
 ; CHECK-NEXT:    [[FOO:%.*]] = add <4 x i32> [[OUT01]], [[B:%.*]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[FOO]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[FOO]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
 ;
   %out0 = insertelement <4 x i32> poison, i32 %a0, i32 0
@@ -183,20 +183,20 @@ define <4 x i32> @inselt_shuf_no_demand_multiuse(i32 %a0, i32 %a1, <4 x i32> %b)
   %out012 = insertelement <4 x i32> %out01, i32 %a0, i32 2
   %foo = add <4 x i32> %out012, %b
   %out0123 = insertelement <4 x i32> %foo, i32 %a1, i32 3
-  %shuffle = shufflevector <4 x i32> %out0123, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x i32> %out0123, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   ret <4 x i32> %shuffle
 }
 
 define <4 x float> @inselt_shuf_no_demand_bogus_insert_index_in_chain(float %a1, float %a2, float %a3, i32 %variable_index) {
 ; CHECK-LABEL: @inselt_shuf_no_demand_bogus_insert_index_in_chain(
 ; CHECK-NEXT:    [[OUT12:%.*]] = insertelement <4 x float> poison, float [[A2:%.*]], i32 [[VARIABLE_INDEX:%.*]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[OUT12]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[OUT12]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
 ;
   %out1 = insertelement <4 x float> poison, float %a1, i32 1
   %out12 = insertelement <4 x float> %out1, float %a2, i32 %variable_index ; something unexpected
   %out123 = insertelement <4 x float> %out12, float %a3, i32 3
-  %shuffle = shufflevector <4 x float> %out123, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %out123, <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   ret <4 x float> %shuffle
 }
 
@@ -205,297 +205,297 @@ define <4 x float> @inselt_shuf_no_demand_bogus_insert_index_in_chain(float %a1,
 define <3 x i8> @shuf_add(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_add(
 ; CHECK-NEXT:    [[BO:%.*]] = add <3 x i8> [[X:%.*]], <i8 poison, i8 2, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 1, i32 undef, i32 2>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = add nsw <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 1, i32 undef, i32 2>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_sub(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_sub(
 ; CHECK-NEXT:    [[BO:%.*]] = sub <3 x i8> <i8 1, i8 poison, i8 3>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 2>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 0, i32 undef, i32 2>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = sub nuw <3 x i8> <i8 1, i8 2, i8 3>, %x
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 2>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 0, i32 undef, i32 2>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_mul(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_mul(
 ; CHECK-NEXT:    [[BO:%.*]] = mul <3 x i8> [[X:%.*]], <i8 1, i8 poison, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 0, i32 2, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = mul nsw <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 0, i32 2, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_and(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_and(
 ; CHECK-NEXT:    [[BO:%.*]] = and <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 poison>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 1, i32 1, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = and <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 1, i32 1, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_or(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_or(
 ; CHECK-NEXT:    [[BO:%.*]] = or <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 poison>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 1, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = or <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 1, i32 undef, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_xor(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_xor(
 ; CHECK-NEXT:    [[BO:%.*]] = xor <3 x i8> [[X:%.*]], <i8 1, i8 poison, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = xor <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_lshr_const_op0(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_lshr_const_op0(
 ; CHECK-NEXT:    [[BO:%.*]] = lshr <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 2, i32 1, i32 undef>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = lshr <3 x i8> <i8 1, i8 2, i8 3>, %x
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 2, i32 1, i32 undef>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_lshr_const_op1(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_lshr_const_op1(
 ; CHECK-NEXT:    [[BO:%.*]] = lshr exact <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 2, i32 1, i32 undef>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = lshr exact <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 2, i32 1, i32 undef>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_ashr_const_op0(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_ashr_const_op0(
 ; CHECK-NEXT:    [[BO:%.*]] = lshr <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 0, i32 undef, i32 1>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = ashr <3 x i8> <i8 1, i8 2, i8 3>, %x
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 0, i32 undef, i32 1>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_ashr_const_op1(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_ashr_const_op1(
 ; CHECK-NEXT:    [[BO:%.*]] = ashr exact <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 0, i32 undef, i32 1>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = ashr exact <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 0, i32 undef, i32 1>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_shl_const_op0(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_shl_const_op0(
 ; CHECK-NEXT:    [[BO:%.*]] = shl nsw <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = shl nsw <3 x i8> <i8 1, i8 2, i8 3>, %x
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_shl_const_op1(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_shl_const_op1(
 ; CHECK-NEXT:    [[BO:%.*]] = shl nuw <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = shl nuw <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_sdiv_const_op0(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_sdiv_const_op0(
 ; CHECK-NEXT:    [[BO:%.*]] = sdiv exact <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 0, i32 undef, i32 1>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = sdiv exact <3 x i8> <i8 1, i8 2, i8 3>, %x
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 0, i32 undef, i32 1>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_sdiv_const_op1(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_sdiv_const_op1(
 ; CHECK-NEXT:    [[BO:%.*]] = sdiv <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 1, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = sdiv <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 1, i32 undef, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_srem_const_op0(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_srem_const_op0(
 ; CHECK-NEXT:    [[BO:%.*]] = srem <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 1, i32 undef, i32 2>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = srem <3 x i8> <i8 1, i8 2, i8 3>, %x
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 1, i32 undef, i32 2>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_srem_const_op1(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_srem_const_op1(
 ; CHECK-NEXT:    [[BO:%.*]] = srem <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 1>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = srem <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 1>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 1>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_udiv_const_op0(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_udiv_const_op0(
 ; CHECK-NEXT:    [[BO:%.*]] = udiv exact <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = udiv exact <3 x i8> <i8 1, i8 2, i8 3>, %x
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_udiv_const_op1(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_udiv_const_op1(
 ; CHECK-NEXT:    [[BO:%.*]] = udiv <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = udiv <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 2, i32 undef, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_urem_const_op0(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_urem_const_op0(
 ; CHECK-NEXT:    [[BO:%.*]] = urem <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 2, i32 1, i32 undef>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = urem <3 x i8> <i8 1, i8 2, i8 3>, %x
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 2, i32 1, i32 undef>
   ret <3 x i8> %r
 }
 
 define <3 x i8> @shuf_urem_const_op1(<3 x i8> %x) {
 ; CHECK-LABEL: @shuf_urem_const_op1(
 ; CHECK-NEXT:    [[BO:%.*]] = urem <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> poison, <3 x i32> <i32 undef, i32 1, i32 0>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %bo = urem <3 x i8> %x, <i8 1, i8 2, i8 3>
-  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> poison, <3 x i32> <i32 undef, i32 1, i32 0>
   ret <3 x i8> %r
 }
 
 define <3 x float> @shuf_fadd(<3 x float> %x) {
 ; CHECK-LABEL: @shuf_fadd(
 ; CHECK-NEXT:    [[BO:%.*]] = fadd <3 x float> [[X:%.*]], <float 1.000000e+00, float 2.000000e+00, float poison>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> poison, <3 x i32> <i32 undef, i32 1, i32 0>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %bo = fadd <3 x float> %x, <float 1.0, float 2.0, float 3.0>
-  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+  %r = shufflevector <3 x float> %bo, <3 x float> poison, <3 x i32> <i32 undef, i32 1, i32 0>
   ret <3 x float> %r
 }
 
 define <3 x float> @shuf_fsub(<3 x float> %x) {
 ; CHECK-LABEL: @shuf_fsub(
 ; CHECK-NEXT:    [[BO:%.*]] = fsub fast <3 x float> <float 1.000000e+00, float poison, float 3.000000e+00>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 0, i32 2>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> poison, <3 x i32> <i32 undef, i32 0, i32 2>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %bo = fsub fast <3 x float> <float 1.0, float 2.0, float 3.0>, %x
-  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 0, i32 2>
+  %r = shufflevector <3 x float> %bo, <3 x float> poison, <3 x i32> <i32 undef, i32 0, i32 2>
   ret <3 x float> %r
 }
 
 define <3 x float> @shuf_fmul(<3 x float> %x) {
 ; CHECK-LABEL: @shuf_fmul(
 ; CHECK-NEXT:    [[BO:%.*]] = fmul reassoc <3 x float> [[X:%.*]], <float 1.000000e+00, float 2.000000e+00, float poison>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> poison, <3 x i32> <i32 undef, i32 1, i32 0>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %bo = fmul reassoc <3 x float> %x, <float 1.0, float 2.0, float 3.0>
-  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+  %r = shufflevector <3 x float> %bo, <3 x float> poison, <3 x i32> <i32 undef, i32 1, i32 0>
   ret <3 x float> %r
 }
 
 define <3 x float> @shuf_fdiv_const_op0(<3 x float> %x) {
 ; CHECK-LABEL: @shuf_fdiv_const_op0(
 ; CHECK-NEXT:    [[BO:%.*]] = fdiv reassoc ninf <3 x float> <float 1.000000e+00, float poison, float 3.000000e+00>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 0, i32 2>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> poison, <3 x i32> <i32 undef, i32 0, i32 2>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %bo = fdiv ninf reassoc <3 x float> <float 1.0, float 2.0, float 3.0>, %x
-  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 0, i32 2>
+  %r = shufflevector <3 x float> %bo, <3 x float> poison, <3 x i32> <i32 undef, i32 0, i32 2>
   ret <3 x float> %r
 }
 
 define <3 x float> @shuf_fdiv_const_op1(<3 x float> %x) {
 ; CHECK-LABEL: @shuf_fdiv_const_op1(
 ; CHECK-NEXT:    [[BO:%.*]] = fdiv nnan ninf <3 x float> [[X:%.*]], <float 1.000000e+00, float 2.000000e+00, float poison>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> poison, <3 x i32> <i32 undef, i32 1, i32 0>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %bo = fdiv ninf nnan <3 x float> %x, <float 1.0, float 2.0, float 3.0>
-  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+  %r = shufflevector <3 x float> %bo, <3 x float> poison, <3 x i32> <i32 undef, i32 1, i32 0>
   ret <3 x float> %r
 }
 
 define <3 x float> @shuf_frem_const_op0(<3 x float> %x) {
 ; CHECK-LABEL: @shuf_frem_const_op0(
 ; CHECK-NEXT:    [[BO:%.*]] = frem nnan <3 x float> <float 1.000000e+00, float poison, float 3.000000e+00>, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 2, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> poison, <3 x i32> <i32 undef, i32 2, i32 0>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %bo = frem nnan <3 x float> <float 1.0, float 2.0, float 3.0>, %x
-  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 2, i32 0>
+  %r = shufflevector <3 x float> %bo, <3 x float> poison, <3 x i32> <i32 undef, i32 2, i32 0>
   ret <3 x float> %r
 }
 
 define <3 x float> @shuf_frem_const_op1(<3 x float> %x) {
 ; CHECK-LABEL: @shuf_frem_const_op1(
 ; CHECK-NEXT:    [[BO:%.*]] = frem reassoc ninf <3 x float> [[X:%.*]], <float poison, float 2.000000e+00, float 3.000000e+00>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> poison, <3 x i32> <i32 1, i32 undef, i32 2>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %bo = frem ninf reassoc <3 x float> %x, <float 1.0, float 2.0, float 3.0>
-  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+  %r = shufflevector <3 x float> %bo, <3 x float> poison, <3 x i32> <i32 1, i32 undef, i32 2>
   ret <3 x float> %r
 }
 
@@ -515,13 +515,13 @@ define i32* @gep_vbase_w_s_idx(<2 x i32*> %base) {
 
 define i32* @gep_splat_base_w_s_idx(i32* %base) {
 ; CHECK-LABEL: @gep_splat_base_w_s_idx(
-; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> undef, i32* [[BASE:%.*]], i32 1
+; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> poison, i32* [[BASE:%.*]], i32 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> [[BASEVEC2]], i64 1
 ; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
 ; CHECK-NEXT:    ret i32* [[EE]]
 ;
   %basevec1 = insertelement <2 x i32*> poison, i32* %base, i32 0
-  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> undef, <2 x i32> zeroinitializer
+  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> poison, <2 x i32> zeroinitializer
   %gep = getelementptr i32, <2 x i32*> %basevec2, i64 1
   %ee = extractelement <2 x i32*> %gep, i32 1
   ret i32* %ee
@@ -530,13 +530,13 @@ define i32* @gep_splat_base_w_s_idx(i32* %base) {
 
 define i32* @gep_splat_base_w_cv_idx(i32* %base) {
 ; CHECK-LABEL: @gep_splat_base_w_cv_idx(
-; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> undef, i32* [[BASE:%.*]], i32 1
+; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> poison, i32* [[BASE:%.*]], i32 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> [[BASEVEC2]], <2 x i64> <i64 poison, i64 1>
 ; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
 ; CHECK-NEXT:    ret i32* [[EE]]
 ;
   %basevec1 = insertelement <2 x i32*> poison, i32* %base, i32 0
-  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> undef, <2 x i32> zeroinitializer
+  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> poison, <2 x i32> zeroinitializer
   %gep = getelementptr i32, <2 x i32*> %basevec2, <2 x i64> <i64 0, i64 1>
   %ee = extractelement <2 x i32*> %gep, i32 1
   ret i32* %ee
@@ -544,13 +544,13 @@ define i32* @gep_splat_base_w_cv_idx(i32* %base) {
 
 define i32* @gep_splat_base_w_vidx(i32* %base, <2 x i64> %idxvec) {
 ; CHECK-LABEL: @gep_splat_base_w_vidx(
-; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> undef, i32* [[BASE:%.*]], i32 1
+; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> poison, i32* [[BASE:%.*]], i32 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> [[BASEVEC2]], <2 x i64> [[IDXVEC:%.*]]
 ; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
 ; CHECK-NEXT:    ret i32* [[EE]]
 ;
   %basevec1 = insertelement <2 x i32*> poison, i32* %base, i32 0
-  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> undef, <2 x i32> zeroinitializer
+  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> poison, <2 x i32> zeroinitializer
   %gep = getelementptr i32, <2 x i32*> %basevec2, <2 x i64> %idxvec
   %ee = extractelement <2 x i32*> %gep, i32 1
   ret i32* %ee
@@ -593,29 +593,29 @@ define i32* @gep_sbase_w_cv_idx(i32* %base) {
 
 define i32* @gep_sbase_w_splat_idx(i32* %base, i64 %idx) {
 ; CHECK-LABEL: @gep_sbase_w_splat_idx(
-; CHECK-NEXT:    [[IDXVEC2:%.*]] = insertelement <2 x i64> undef, i64 [[IDX:%.*]], i32 1
+; CHECK-NEXT:    [[IDXVEC2:%.*]] = insertelement <2 x i64> poison, i64 [[IDX:%.*]], i32 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[BASE:%.*]], <2 x i64> [[IDXVEC2]]
 ; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
 ; CHECK-NEXT:    ret i32* [[EE]]
 ;
   %idxvec1 = insertelement <2 x i64> poison, i64 %idx, i32 0
-  %idxvec2 = shufflevector <2 x i64> %idxvec1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %idxvec2 = shufflevector <2 x i64> %idxvec1, <2 x i64> poison, <2 x i32> zeroinitializer
   %gep = getelementptr i32, i32* %base, <2 x i64> %idxvec2
   %ee = extractelement <2 x i32*> %gep, i32 1
   ret i32* %ee
 }
 define i32* @gep_splat_both(i32* %base, i64 %idx) {
 ; CHECK-LABEL: @gep_splat_both(
-; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> undef, i32* [[BASE:%.*]], i32 1
-; CHECK-NEXT:    [[IDXVEC2:%.*]] = insertelement <2 x i64> undef, i64 [[IDX:%.*]], i32 1
+; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> poison, i32* [[BASE:%.*]], i32 1
+; CHECK-NEXT:    [[IDXVEC2:%.*]] = insertelement <2 x i64> poison, i64 [[IDX:%.*]], i32 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> [[BASEVEC2]], <2 x i64> [[IDXVEC2]]
 ; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
 ; CHECK-NEXT:    ret i32* [[EE]]
 ;
   %basevec1 = insertelement <2 x i32*> poison, i32* %base, i32 0
-  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> undef, <2 x i32> zeroinitializer
+  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> poison, <2 x i32> zeroinitializer
   %idxvec1 = insertelement <2 x i64> poison, i64 %idx, i32 0
-  %idxvec2 = shufflevector <2 x i64> %idxvec1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %idxvec2 = shufflevector <2 x i64> %idxvec1, <2 x i64> poison, <2 x i32> zeroinitializer
   %gep = getelementptr i32, <2 x i32*> %basevec2, <2 x i64> %idxvec2
   %ee = extractelement <2 x i32*> %gep, i32 1
   ret i32* %ee
@@ -682,7 +682,7 @@ define <4 x i8> @select_cond_with_eq_true_false_elts(<4 x i8> %x, <4 x i8> %y, <
 ; CHECK-NEXT:    ret <4 x i8> [[R]]
 ;
   %tval = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  %splat = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> zeroinitializer
+  %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> zeroinitializer
   %r = select <4 x i1> %splat, <4 x i8> %tval, <4 x i8> %y
   ret <4 x i8> %r
 }
@@ -691,13 +691,13 @@ define <4 x i8> @select_cond_with_eq_true_false_elts(<4 x i8> %x, <4 x i8> %y, <
 
 define <4 x i8> @select_cond_with_eq_true_false_elts2(<4 x i8> %x, <4 x i8> %y, <4 x i1> %cmp) {
 ; CHECK-LABEL: @select_cond_with_eq_true_false_elts2(
-; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> poison, <4 x i32> <i32 undef, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND]], <4 x i8> [[Y:%.*]], <4 x i8> [[X:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[SEL]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <4 x i8> [[R]]
 ;
   %tval = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  %cond = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %cond = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %r = select <4 x i1> %cond, <4 x i8> %tval, <4 x i8> %x
   ret <4 x i8> %r
 }
@@ -709,13 +709,13 @@ define <4 x float> @select_cond_with_eq_true_false_elts3(<4 x float> %x, <4 x fl
 ; CHECK-LABEL: @select_cond_with_eq_true_false_elts3(
 ; CHECK-NEXT:    [[TVAL:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x i32> <i32 1, i32 3, i32 5, i32 undef>
 ; CHECK-NEXT:    [[FVAL:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[X]], <4 x i32> <i32 0, i32 7, i32 6, i32 undef>
-; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> poison, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x float> [[TVAL]], <4 x float> [[FVAL]]
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %tval = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 undef>
   %fval = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 0, i32 7, i32 6, i32 undef>
-  %cond = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+  %cond = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
   %r = select <4 x i1> %cond, <4 x float> %tval, <4 x float> %fval
   ret <4 x float> %r
 }
@@ -723,12 +723,12 @@ define <4 x float> @select_cond_with_eq_true_false_elts3(<4 x float> %x, <4 x fl
 define <4 x i8> @select_cond_with_undef_true_false_elts(<4 x i8> %x, <4 x i8> %y, <4 x i1> %cmp) {
 ; CHECK-LABEL: @select_cond_with_undef_true_false_elts(
 ; CHECK-NEXT:    [[TVAL:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> poison, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x i8> [[TVAL]], <4 x i8> [[X:%.*]]
 ; CHECK-NEXT:    ret <4 x i8> [[R]]
 ;
   %tval = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 undef, i32 5, i32 6, i32 7>
-  %cond = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %cond = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %r = select <4 x i1> %cond, <4 x i8> %tval, <4 x i8> %x
   ret <4 x i8> %r
 }

diff --git a/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll
index e39b3e46801f..788d79262fa3 100644
--- a/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll
@@ -8,7 +8,7 @@ define <4 x i16*> @PR41270([4 x i16]* %x) {
 ; CHECK-NEXT:    ret <4 x i16*> [[TMP2]]
 ;
   %ins = insertelement <4 x [4 x i16]*> poison, [4 x i16]* %x, i32 0
-  %splat = shufflevector <4 x [4 x i16]*> %ins, <4 x [4 x i16]*> undef, <4 x i32> zeroinitializer
+  %splat = shufflevector <4 x [4 x i16]*> %ins, <4 x [4 x i16]*> poison, <4 x i32> zeroinitializer
   %t2 = getelementptr inbounds [4 x i16], <4 x [4 x i16]*> %splat, i32 0, i32 3
   %t3 = extractelement <4 x i16*> %t2, i32 3
   %ins2 = insertelement <4 x i16*> poison, i16* %t3, i32 0

diff --git a/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
index ba4f88d87d3b..e8c96d15bdb1 100644
--- a/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
@@ -6,7 +6,7 @@ define void @f(i64 %val, i32  %limit, i32 *%ptr) {
 ; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ]
 entry:
   %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0
-  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
+  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> poison, <16 x i32> zeroinitializer
   %0 = add <16 x i64> %vector, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %1 = trunc <16 x i64> %0 to <16 x i32>
   br label %loop
@@ -32,7 +32,7 @@ define void @copy(i64 %val, i32  %limit, i32 *%ptr) {
 ; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ]
 entry:
   %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0
-  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
+  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> poison, <16 x i32> zeroinitializer
   %0 = add <16 x i64> %vector, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %1 = trunc <16 x i64> %0 to <16 x i32>
   br label %loop
@@ -59,7 +59,7 @@ define void @nocopy(i64 %val, i32  %limit, i32 *%ptr) {
 ; CHECK: phi <16 x i32> [ %3, %entry ], [ %inc, %loop ]
 entry:
   %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0
-  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
+  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> poison, <16 x i32> zeroinitializer
   %0 = add <16 x i64> %vector, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %1 = trunc <16 x i64> %0 to <16 x i32>
   br label %loop

diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
index 27ed637963b5..3a2509722114 100644
--- a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
@@ -5,7 +5,7 @@ define <4 x float> @test1(<4 x float> %v1) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    ret <4 x float> [[V1:%.*]]
 ;
-  %v2 = shufflevector <4 x float> %v1, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2 = shufflevector <4 x float> %v1, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %v2
 }
 
@@ -32,7 +32,7 @@ define i32 @test4(<4 x i32> %X) {
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
-  %t = shufflevector <4 x i32> %X, <4 x i32> undef, <4 x i32> zeroinitializer
+  %t = shufflevector <4 x i32> %X, <4 x i32> poison, <4 x i32> zeroinitializer
   %r = extractelement <4 x i32> %t, i32 0
   ret i32 %r
 }
@@ -42,7 +42,7 @@ define i32 @test5(<4 x i32> %X) {
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
-  %t = shufflevector <4 x i32> %X, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
+  %t = shufflevector <4 x i32> %X, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
   %r = extractelement <4 x i32> %t, i32 0
   ret i32 %r
 }
@@ -53,7 +53,7 @@ define float @test6(<4 x float> %X) {
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %X1 = bitcast <4 x float> %X to <4 x i32>
-  %t = shufflevector <4 x i32> %X1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %t = shufflevector <4 x i32> %X1, <4 x i32> poison, <4 x i32> zeroinitializer
   %t2 = bitcast <4 x i32> %t to <4 x float>
   %r = extractelement <4 x float> %t2, i32 0
   ret float %r
@@ -61,12 +61,12 @@ define float @test6(<4 x float> %X) {
 
 define float @testvscale6(<vscale x 4 x float> %X) {
 ; CHECK-LABEL: @testvscale6(
-; CHECK-NEXT:    [[T2:%.*]] = shufflevector <vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[T2:%.*]] = shufflevector <vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x float> [[T2]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %X1 = bitcast <vscale x 4 x float> %X to <vscale x 4 x i32>
-  %t = shufflevector <vscale x 4 x i32> %X1, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %t = shufflevector <vscale x 4 x i32> %X1, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %t2 = bitcast <vscale x 4 x i32> %t to <vscale x 4 x float>
   %r = extractelement <vscale x 4 x float> %t2, i32 0
   ret float %r
@@ -75,10 +75,10 @@ define float @testvscale6(<vscale x 4 x float> %X) {
 
 define <4 x float> @test7(<4 x float> %x) {
 ; CHECK-LABEL: @test7(
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
-  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >
+  %r = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >
   ret <4 x float> %r
 }
 
@@ -103,11 +103,11 @@ define <4 x float> @test8(<4 x float> %x, <4 x float> %y) {
 ; different length then the second.
 define <4 x i8> @test9(<16 x i8> %t6) {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    [[T9:%.*]] = shufflevector <16 x i8> [[T6:%.*]], <16 x i8> undef, <4 x i32> <i32 13, i32 9, i32 4, i32 13>
+; CHECK-NEXT:    [[T9:%.*]] = shufflevector <16 x i8> [[T6:%.*]], <16 x i8> poison, <4 x i32> <i32 13, i32 9, i32 4, i32 13>
 ; CHECK-NEXT:    ret <4 x i8> [[T9]]
 ;
-  %t7 = shufflevector <16 x i8> %t6, <16 x i8> undef, <4 x i32> < i32 13, i32 9, i32 4, i32 13 >
-  %t9 = shufflevector <4 x i8> %t7, <4 x i8> undef, <4 x i32> < i32 3, i32 1, i32 2, i32 0 >
+  %t7 = shufflevector <16 x i8> %t6, <16 x i8> poison, <4 x i32> < i32 13, i32 9, i32 4, i32 13 >
+  %t9 = shufflevector <4 x i8> %t7, <4 x i8> poison, <4 x i32> < i32 3, i32 1, i32 2, i32 0 >
   ret <4 x i8> %t9
 }
 
@@ -117,12 +117,12 @@ define <4 x i8> @test9(<16 x i8> %t6) {
 
 define <4 x i8> @test9a(<16 x i8> %t6) {
 ; CHECK-LABEL: @test9a(
-; CHECK-NEXT:    [[T7:%.*]] = shufflevector <16 x i8> [[T6:%.*]], <16 x i8> undef, <4 x i32> <i32 undef, i32 9, i32 4, i32 8>
-; CHECK-NEXT:    [[T9:%.*]] = shufflevector <4 x i8> [[T7]], <4 x i8> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    [[T7:%.*]] = shufflevector <16 x i8> [[T6:%.*]], <16 x i8> poison, <4 x i32> <i32 undef, i32 9, i32 4, i32 8>
+; CHECK-NEXT:    [[T9:%.*]] = shufflevector <4 x i8> [[T7]], <4 x i8> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    ret <4 x i8> [[T9]]
 ;
-  %t7 = shufflevector <16 x i8> %t6, <16 x i8> undef, <4 x i32> < i32 undef, i32 9, i32 4, i32 8 >
-  %t9 = shufflevector <4 x i8> %t7, <4 x i8> undef, <4 x i32> < i32 3, i32 1, i32 2, i32 0 >
+  %t7 = shufflevector <16 x i8> %t6, <16 x i8> poison, <4 x i32> < i32 undef, i32 9, i32 4, i32 8 >
+  %t9 = shufflevector <4 x i8> %t7, <4 x i8> poison, <4 x i32> < i32 3, i32 1, i32 2, i32 0 >
   ret <4 x i8> %t9
 }
 
@@ -134,18 +134,18 @@ define <4 x i8> @test9b(<4 x i8> %t6, <4 x i8> %t7) {
 ; CHECK-NEXT:    ret <4 x i8> [[T9]]
 ;
   %t1 = shufflevector <4 x i8> %t6, <4 x i8> %t7, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>
-  %t9 = shufflevector <8 x i8> %t1, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %t9 = shufflevector <8 x i8> %t1, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x i8> %t9
 }
 
 ; Redundant vector splats should be removed.  Radar 8597790.
 define <4 x i32> @test10(<4 x i32> %t5) {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT:    [[T7:%.*]] = shufflevector <4 x i32> [[T5:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[T7:%.*]] = shufflevector <4 x i32> [[T5:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    ret <4 x i32> [[T7]]
 ;
-  %t6 = shufflevector <4 x i32> %t5, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-  %t7 = shufflevector <4 x i32> %t6, <4 x i32> undef, <4 x i32> zeroinitializer
+  %t6 = shufflevector <4 x i32> %t5, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %t7 = shufflevector <4 x i32> %t6, <4 x i32> poison, <4 x i32> zeroinitializer
   ret <4 x i32> %t7
 }
 
@@ -156,8 +156,8 @@ define <8 x i8> @test11(<16 x i8> %t6) {
 ; CHECK-NEXT:    [[T3:%.*]] = shufflevector <16 x i8> [[T6:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i8> [[T3]]
 ;
-  %t1 = shufflevector <16 x i8> %t6, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %t2 = shufflevector <16 x i8> %t6, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %t1 = shufflevector <16 x i8> %t6, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t2 = shufflevector <16 x i8> %t6, <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %t3 = shufflevector <4 x i8> %t1, <4 x i8> %t2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %t3
 }
@@ -169,7 +169,7 @@ define <8 x i8> @test12(<8 x i8> %t6, <8 x i8> %t2) {
 ; CHECK-NEXT:    [[T3:%.*]] = shufflevector <8 x i8> [[T6:%.*]], <8 x i8> [[T2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 9, i32 8, i32 11, i32 12>
 ; CHECK-NEXT:    ret <8 x i8> [[T3]]
 ;
-  %t1 = shufflevector <8 x i8> %t6, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 undef, i32 7>
+  %t1 = shufflevector <8 x i8> %t6, <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 undef, i32 7>
   %t3 = shufflevector <8 x i8> %t1, <8 x i8> %t2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 9, i32 8, i32 11, i32 12>
   ret <8 x i8> %t3
 }
@@ -181,7 +181,7 @@ define <8 x i8> @test12a(<8 x i8> %t6, <8 x i8> %t2) {
 ; CHECK-NEXT:    [[T3:%.*]] = shufflevector <8 x i8> [[T2:%.*]], <8 x i8> [[T6:%.*]], <8 x i32> <i32 0, i32 3, i32 1, i32 4, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    ret <8 x i8> [[T3]]
 ;
-  %t1 = shufflevector <8 x i8> %t6, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 undef, i32 7>
+  %t1 = shufflevector <8 x i8> %t6, <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 undef, i32 7>
   %t3 = shufflevector <8 x i8> %t2, <8 x i8> %t1, <8 x i32> <i32 0, i32 3, i32 1, i32 4, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i8> %t3
 }
@@ -194,7 +194,7 @@ define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-NEXT:    ret <2 x i8> [[EXTRACT_SUBV]]
 ;
   %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <3 x i32> <i32 0, i32 2, i32 0>
-  %extract_subv = shufflevector <3 x i8> %shuf, <3 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %extract_subv = shufflevector <3 x i8> %shuf, <3 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %extract_subv
 }
 
@@ -207,7 +207,7 @@ define <4 x i8> @extract_subvector_of_shuffle_undefs_types(<2 x i8> %x, <2 x i8>
 ; CHECK-NEXT:    ret <4 x i8> [[EXTRACT_SUBV]]
 ;
   %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <5 x i32> <i32 undef, i32 2, i32 0, i32 1, i32 0>
-  %extract_subv = shufflevector <5 x i8> %shuf, <5 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %extract_subv = shufflevector <5 x i8> %shuf, <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
   ret <4 x i8> %extract_subv
 }
 
@@ -219,12 +219,12 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y
 ; CHECK-LABEL: @extract_subvector_of_shuffle_extra_use(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <5 x i32> <i32 undef, i32 2, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    call void @use_v5i8(<5 x i8> [[SHUF]])
-; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <5 x i8> [[SHUF]], <5 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <5 x i8> [[SHUF]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    ret <4 x i8> [[EXTRACT_SUBV]]
 ;
   %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <5 x i32> <i32 undef, i32 2, i32 0, i32 1, i32 0>
   call void @use_v5i8(<5 x i8> %shuf)
-  %extract_subv = shufflevector <5 x i8> %shuf, <5 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %extract_subv = shufflevector <5 x i8> %shuf, <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
   ret <4 x i8> %extract_subv
 }
 
@@ -238,7 +238,7 @@ define <2 x i8> @test13a(i8 %x1, i8 %x2) {
   %A = insertelement <2 x i8> poison, i8 %x1, i32 0
   %B = insertelement <2 x i8> %A, i8 %x2, i32 1
   %C = add <2 x i8> %B, <i8 5, i8 7>
-  %D = shufflevector <2 x i8> %C, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+  %D = shufflevector <2 x i8> %C, <2 x i8> poison, <2 x i32> <i32 1, i32 0>
   ret <2 x i8> %D
 }
 
@@ -249,13 +249,13 @@ define <3 x i32> @add_wider(i32 %y, i32 %z) {
 ; CHECK-NEXT:    [[I0:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <2 x i32> [[I0]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    [[A:%.*]] = add <2 x i32> [[I1]], <i32 255, i32 255>
-; CHECK-NEXT:    [[EXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    [[EXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <3 x i32> <i32 0, i32 1, i32 undef>
 ; CHECK-NEXT:    ret <3 x i32> [[EXT]]
 ;
   %i0 = insertelement <2 x i32> poison, i32 %y, i32 0
   %i1 = insertelement <2 x i32> %i0, i32 %z, i32 1
   %a = add <2 x i32> %i1, <i32 255, i32 255>
-  %ext = shufflevector <2 x i32> %a, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+  %ext = shufflevector <2 x i32> %a, <2 x i32> poison, <3 x i32> <i32 0, i32 1, i32 undef>
   ret <3 x i32> %ext
 }
 
@@ -266,13 +266,13 @@ define <3 x i32> @div_wider(i32 %y, i32 %z) {
 ; CHECK-NEXT:    [[I0:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <2 x i32> [[I0]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    [[A:%.*]] = sdiv <2 x i32> [[I1]], <i32 255, i32 255>
-; CHECK-NEXT:    [[EXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    [[EXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <3 x i32> <i32 0, i32 1, i32 undef>
 ; CHECK-NEXT:    ret <3 x i32> [[EXT]]
 ;
   %i0 = insertelement <2 x i32> poison, i32 %y, i32 0
   %i1 = insertelement <2 x i32> %i0, i32 %z, i32 1
   %a = sdiv <2 x i32> %i1, <i32 255, i32 255>
-  %ext = shufflevector <2 x i32> %a, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+  %ext = shufflevector <2 x i32> %a, <2 x i32> poison, <3 x i32> <i32 0, i32 1, i32 undef>
   ret <3 x i32> %ext
 }
 
@@ -286,17 +286,17 @@ define <3 x i8> @fold_inselts_with_widening_shuffle(i8 %x, i8 %y) {
 ;
   %ins0 = insertelement <2 x i8> poison, i8 %x, i32 0
   %ins1 = insertelement <2 x i8> %ins0, i8 %y, i32 1
-  %widen = shufflevector <2 x i8> %ins1, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+  %widen = shufflevector <2 x i8> %ins1, <2 x i8> poison, <3 x i32> <i32 0, i32 1, i32 undef>
   ret <3 x i8> %widen
 }
 
 define <2 x i8> @test13b(i8 %x) {
 ; CHECK-LABEL: @test13b(
-; CHECK-NEXT:    [[B:%.*]] = insertelement <2 x i8> undef, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[B:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
 ; CHECK-NEXT:    ret <2 x i8> [[B]]
 ;
   %A = insertelement <2 x i8> poison, i8 %x, i32 0
-  %B = shufflevector <2 x i8> %A, <2 x i8> undef, <2 x i32> <i32 undef, i32 0>
+  %B = shufflevector <2 x i8> %A, <2 x i8> poison, <2 x i32> <i32 undef, i32 0>
   ret <2 x i8> %B
 }
 
@@ -308,7 +308,7 @@ define <2 x i8> @test13c(i8 %x1, i8 %x2) {
 ;
   %A = insertelement <4 x i8> poison, i8 %x1, i32 0
   %B = insertelement <4 x i8> %A, i8 %x2, i32 2
-  %C = shufflevector <4 x i8> %B, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
+  %C = shufflevector <4 x i8> %B, <4 x i8> poison, <2 x i32> <i32 0, i32 2>
   ret <2 x i8> %C
 }
 
@@ -325,7 +325,7 @@ define void @test14(i16 %conv10) {
   %div = udiv <4 x i16> %t1, %vecinit11
   store <4 x i16> %div, <4 x i16>* %t
   %t4 = load <4 x i16>, <4 x i16>* %t
-  %t5 = shufflevector <4 x i16> %t4, <4 x i16> undef, <2 x i32> <i32 2, i32 0>
+  %t5 = shufflevector <4 x i16> %t4, <4 x i16> poison, <2 x i32> <i32 2, i32 0>
   %cmp = icmp ule <2 x i16> %t5, undef
   %sext = sext <2 x i1> %cmp to <2 x i16>
   ret void
@@ -366,7 +366,7 @@ define <1 x i32> @test16a(i32 %ele) {
 ;
   %t0 = insertelement <2 x i32> <i32 1, i32 undef>, i32 %ele, i32 1
   %t1 = shl <2 x i32> %t0, <i32 1, i32 1>
-  %t2 = shufflevector <2 x i32> %t1, <2 x i32> undef, <1 x i32> <i32 0>
+  %t2 = shufflevector <2 x i32> %t1, <2 x i32> poison, <1 x i32> <i32 0>
   ret <1 x i32> %t2
 }
 
@@ -376,7 +376,7 @@ define <4 x i8> @test16b(i8 %ele) {
 ;
   %t0 = insertelement <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 undef, i8 1>, i8 %ele, i32 6
   %t1 = shl <8 x i8> %t0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-  %t2 = shufflevector <8 x i8> %t1, <8 x i8> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  %t2 = shufflevector <8 x i8> %t1, <8 x i8> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   ret <4 x i8> %t2
 }
 
@@ -409,14 +409,14 @@ declare void @use(<2 x float>)
 
 define <2 x float> @shuffle_fadd_multiuse(<2 x float> %v1, <2 x float> %v2) {
 ; CHECK-LABEL: @shuffle_fadd_multiuse(
-; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x float> [[V1:%.*]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x float> [[V1:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[V1]], [[V2:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    call void @use(<2 x float> [[T1]])
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %t1 = shufflevector <2 x float> %v1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-  %t2 = shufflevector <2 x float> %v2, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+  %t1 = shufflevector <2 x float> %v1, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %t2 = shufflevector <2 x float> %v2, <2 x float> poison, <2 x i32> <i32 1, i32 0>
   %r = fadd <2 x float> %t1, %t2
   call void @use(<2 x float> %t1)
   ret <2 x float> %r
@@ -424,14 +424,14 @@ define <2 x float> @shuffle_fadd_multiuse(<2 x float> %v1, <2 x float> %v2) {
 
 define <2 x float> @shuffle_fdiv_multiuse(<2 x float> %v1, <2 x float> %v2) {
 ; CHECK-LABEL: @shuffle_fdiv_multiuse(
-; CHECK-NEXT:    [[T2:%.*]] = shufflevector <2 x float> [[V2:%.*]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = shufflevector <2 x float> [[V2:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <2 x float> [[V1:%.*]], [[V2]]
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    call void @use(<2 x float> [[T2]])
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %t1 = shufflevector <2 x float> %v1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-  %t2 = shufflevector <2 x float> %v2, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+  %t1 = shufflevector <2 x float> %v1, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %t2 = shufflevector <2 x float> %v2, <2 x float> poison, <2 x i32> <i32 1, i32 0>
   %r = fdiv <2 x float> %t1, %t2
   call void @use(<2 x float> %t2)
   ret <2 x float> %r
@@ -441,15 +441,15 @@ define <2 x float> @shuffle_fdiv_multiuse(<2 x float> %v1, <2 x float> %v2) {
 
 define <2 x float> @shuffle_fsub_multiuse(<2 x float> %v1, <2 x float> %v2) {
 ; CHECK-LABEL: @shuffle_fsub_multiuse(
-; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x float> [[V1:%.*]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[T2:%.*]] = shufflevector <2 x float> [[V2:%.*]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x float> [[V1:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = shufflevector <2 x float> [[V2:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[R:%.*]] = fsub <2 x float> [[T1]], [[T2]]
 ; CHECK-NEXT:    call void @use(<2 x float> [[T1]])
 ; CHECK-NEXT:    call void @use(<2 x float> [[T2]])
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %t1 = shufflevector <2 x float> %v1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-  %t2 = shufflevector <2 x float> %v2, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+  %t1 = shufflevector <2 x float> %v1, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %t2 = shufflevector <2 x float> %v2, <2 x float> poison, <2 x i32> <i32 1, i32 0>
   %r = fsub <2 x float> %t1, %t2
   call void @use(<2 x float> %t1)
   call void @use(<2 x float> %t2)
@@ -510,7 +510,7 @@ define <4 x i32> @add_const(<4 x i32> %v) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %t1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+  %t1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   %r = add <4 x i32> %t1, <i32 41, i32 42, i32 43, i32 44>
   ret <4 x i32> %r
 }
@@ -521,7 +521,7 @@ define <4 x i32> @sub_const(<4 x i32> %v) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %t1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %t1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %r = sub <4 x i32> <i32 41, i32 42, i32 43, i32 44>, %t1
   ret <4 x i32> %r
 }
@@ -530,12 +530,12 @@ define <4 x i32> @sub_const(<4 x i32> %v) {
 
 define <2 x float> @fadd_const_multiuse(<2 x float> %v) {
 ; CHECK-LABEL: @fadd_const_multiuse(
-; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[R:%.*]] = fadd <2 x float> [[T1]], <float 4.100000e+01, float 4.200000e+01>
 ; CHECK-NEXT:    call void @use(<2 x float> [[T1]])
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %t1 = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+  %t1 = shufflevector <2 x float> %v, <2 x float> poison, <2 x i32> <i32 1, i32 0>
   %r = fadd <2 x float> %t1, <float 41.0, float 42.0>
   call void @use(<2 x float> %t1)
   ret <2 x float> %r
@@ -549,7 +549,7 @@ define <4 x i32> @mul_const_splat(<4 x i32> %v) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %t1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %t1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %r = mul <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %t1
   ret <4 x i32> %r
 }
@@ -562,7 +562,7 @@ define <4 x i32> @lshr_const_half_splat(<4 x i32> %v) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %t1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+  %t1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
   %r = lshr <4 x i32> <i32 8, i32 8, i32 9, i32 9>, %t1
   ret <4 x i32> %r
 }
@@ -571,11 +571,11 @@ define <4 x i32> @lshr_const_half_splat(<4 x i32> %v) {
 
 define <2 x float> @fmul_const_invalid_constant(<2 x float> %v) {
 ; CHECK-LABEL: @fmul_const_invalid_constant(
-; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = fmul <2 x float> [[T1]], <float 4.100000e+01, float 4.200000e+01>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %t1 = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 0, i32 0>
+  %t1 = shufflevector <2 x float> %v, <2 x float> poison, <2 x i32> <i32 0, i32 0>
   %r = fmul <2 x float> %t1, <float 41.0, float 42.0>
   ret <2 x float> %r
 }
@@ -588,7 +588,7 @@ define <4 x i8> @widening_shuffle_add_1(<2 x i8> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i8> [[R]]
 ;
-  %widex = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widex = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %r = add <4 x i8> %widex, <i8 42, i8 43, i8 44, i8 45>
   ret <4 x i8> %r
 }
@@ -601,7 +601,7 @@ define <4 x i8> @widening_shuffle_add_2(<2 x i8> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> undef, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i8> [[R]]
 ;
-  %widex = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
+  %widex = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
   %r = add <4 x i8> %widex, <i8 42, i8 43, i8 44, i8 45>
   ret <4 x i8> %r
 }
@@ -610,11 +610,11 @@ define <4 x i8> @widening_shuffle_add_2(<2 x i8> %x) {
 
 define <4 x i8> @widening_shuffle_add_invalid_constant(<2 x i8> %x) {
 ; CHECK-LABEL: @widening_shuffle_add_invalid_constant(
-; CHECK-NEXT:    [[WIDEX:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[WIDEX:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> poison, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i8> [[WIDEX]], <i8 42, i8 43, i8 44, i8 45>
 ; CHECK-NEXT:    ret <4 x i8> [[R]]
 ;
-  %widex = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
+  %widex = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   %r = add <4 x i8> %widex, <i8 42, i8 43, i8 44, i8 45>
   ret <4 x i8> %r
 }
@@ -623,11 +623,11 @@ define <4 x i8> @widening_shuffle_add_invalid_constant(<2 x i8> %x) {
 
 define <4 x i8> @widening_shuffle_add_invalid_mask(<2 x i8> %x) {
 ; CHECK-LABEL: @widening_shuffle_add_invalid_mask(
-; CHECK-NEXT:    [[WIDEX:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 0>
+; CHECK-NEXT:    [[WIDEX:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 0>
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i8> [[WIDEX]], <i8 42, i8 43, i8 44, i8 45>
 ; CHECK-NEXT:    ret <4 x i8> [[R]]
 ;
-  %widex = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 0>
+  %widex = shufflevector <2 x i8> %x, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 0>
   %r = add <4 x i8> %widex, <i8 42, i8 43, i8 44, i8 45>
   ret <4 x i8> %r
 }
@@ -641,7 +641,7 @@ define <4 x i16> @widening_shuffle_shl_constant_op0(<2 x i16> %v) {
 ; CHECK-NEXT:    [[BO:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i16> [[BO]]
 ;
-  %shuf = shufflevector <2 x i16> %v, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuf = shufflevector <2 x i16> %v, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %bo = shl <4 x i16> <i16 42, i16 -42, i16 -1, i16 -1>, %shuf
   ret <4 x i16> %bo
 }
@@ -655,7 +655,7 @@ define <4 x i16> @widening_shuffle_shl_constant_op1(<2 x i16> %v) {
 ; CHECK-NEXT:    [[BO:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i16> [[BO]]
 ;
-  %shuf = shufflevector <2 x i16> %v, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuf = shufflevector <2 x i16> %v, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %bo = shl <4 x i16> %shuf, <i16 2, i16 4, i16 0, i16 0>
   ret <4 x i16> %bo
 }
@@ -665,11 +665,11 @@ define <4 x i16> @widening_shuffle_shl_constant_op1(<2 x i16> %v) {
 
 define <4 x i16> @widening_shuffle_shl_constant_op1_non0(<2 x i16> %v) {
 ; CHECK-LABEL: @widening_shuffle_shl_constant_op1_non0(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i16> [[V:%.*]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i16> [[V:%.*]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BO:%.*]] = shl <4 x i16> [[SHUF]], <i16 2, i16 4, i16 1, i16 2>
 ; CHECK-NEXT:    ret <4 x i16> [[BO]]
 ;
-  %shuf = shufflevector <2 x i16> %v, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuf = shufflevector <2 x i16> %v, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %bo = shl <4 x i16> %shuf, <i16 2, i16 4, i16 1, i16 2>
   ret <4 x i16> %bo
 }
@@ -679,11 +679,11 @@ define <4 x i16> @widening_shuffle_shl_constant_op1_non0(<2 x i16> %v) {
 
 define <4 x i16> @widening_shuffle_or(<2 x i16> %v) {
 ; CHECK-LABEL: @widening_shuffle_or(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i16> [[V:%.*]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i16> [[V:%.*]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BO:%.*]] = or <4 x i16> [[SHUF]], <i16 42, i16 -42, i16 -1, i16 -1>
 ; CHECK-NEXT:    ret <4 x i16> [[BO]]
 ;
-  %shuf = shufflevector <2 x i16> %v, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuf = shufflevector <2 x i16> %v, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %bo = or <4 x i16> %shuf, <i16 42, i16 -42, i16 -1, i16 -1>
   ret <4 x i16> %bo
 }
@@ -739,12 +739,12 @@ define <4 x i16> @pr19717a(<8 x i16> %in0, <8 x i16> %in1) {
 
 define <8 x i8> @pr19730(<16 x i8> %in0) {
 ; CHECK-LABEL: @pr19730(
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[IN0:%.*]], <16 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x i8> [[SHUFFLE]], <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[IN0:%.*]], <16 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x i8> [[SHUFFLE]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i8> [[SHUFFLE1]]
 ;
-  %shuffle = shufflevector <16 x i8> %in0, <16 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-  %shuffle1 = shufflevector <8 x i8> %shuffle, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %shuffle = shufflevector <16 x i8> %in0, <16 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %shuffle1 = shufflevector <8 x i8> %shuffle, <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i8> %shuffle1
 }
 
@@ -766,34 +766,34 @@ define i32 @pr19737(<4 x i32> %in0) {
 
 define <4 x i32> @pr20059(<4 x i32> %p1, <4 x i32> %p2) {
 ; CHECK-LABEL: @pr20059(
-; CHECK-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[P1:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[P2:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[P1:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[P2:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[RETVAL:%.*]] = srem <4 x i32> [[SPLAT1]], [[SPLAT2]]
 ; CHECK-NEXT:    ret <4 x i32> [[RETVAL]]
 ;
-  %splat1 = shufflevector <4 x i32> %p1, <4 x i32> undef, <4 x i32> zeroinitializer
-  %splat2 = shufflevector <4 x i32> %p2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat1 = shufflevector <4 x i32> %p1, <4 x i32> poison, <4 x i32> zeroinitializer
+  %splat2 = shufflevector <4 x i32> %p2, <4 x i32> poison, <4 x i32> zeroinitializer
   %retval = srem <4 x i32> %splat1, %splat2
   ret <4 x i32> %retval
 }
 
 define <4 x i32> @pr20114(<4 x i32> %__mask) {
 ; CHECK-LABEL: @pr20114(
-; CHECK-NEXT:    [[MASK01_I:%.*]] = shufflevector <4 x i32> [[__MASK:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[MASK01_I:%.*]] = shufflevector <4 x i32> [[__MASK:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[MASKED_NEW_I_I_I:%.*]] = and <4 x i32> [[MASK01_I]], bitcast (<2 x i64> <i64 ptrtoint (<4 x i32> (<4 x i32>)* @pr20114 to i64), i64 ptrtoint (<4 x i32> (<4 x i32>)* @pr20114 to i64)> to <4 x i32>)
 ; CHECK-NEXT:    ret <4 x i32> [[MASKED_NEW_I_I_I]]
 ;
-  %mask01.i = shufflevector <4 x i32> %__mask, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %mask01.i = shufflevector <4 x i32> %__mask, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
   %masked_new.i.i.i = and <4 x i32> bitcast (<2 x i64> <i64 ptrtoint (<4 x i32> (<4 x i32>)* @pr20114 to i64), i64 ptrtoint (<4 x i32> (<4 x i32>)* @pr20114 to i64)> to <4 x i32>), %mask01.i
   ret <4 x i32> %masked_new.i.i.i
 }
 
 define <2 x i32*> @pr23113(<4 x i32*> %A) {
 ; CHECK-LABEL: @pr23113(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32*> [[A:%.*]], <4 x i32*> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32*> [[A:%.*]], <4 x i32*> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x i32*> [[TMP1]]
 ;
-  %1 = shufflevector <4 x i32*> %A, <4 x i32*> undef, <2 x i32> <i32 0, i32 1>
+  %1 = shufflevector <4 x i32*> %A, <4 x i32*> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i32*> %1
 }
 
@@ -803,7 +803,7 @@ define <2 x i32> @PR37648(<2 x i32> %x) {
 ; CHECK-LABEL: @PR37648(
 ; CHECK-NEXT:    ret <2 x i32> zeroinitializer
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = urem <2 x i32> %splat, <i32 1, i32 1>
   ret <2 x i32> %r
 }
@@ -817,7 +817,7 @@ define <2 x i32> @add_splat_constant(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = add <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
@@ -828,7 +828,7 @@ define <2 x i32> @sub_splat_constant0(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = sub <2 x i32> <i32 42, i32 42>, %splat
   ret <2 x i32> %r
 }
@@ -839,7 +839,7 @@ define <2 x i32> @sub_splat_constant1(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = sub <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
@@ -850,7 +850,7 @@ define <2 x i32> @mul_splat_constant(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = mul <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
@@ -861,7 +861,7 @@ define <2 x i32> @shl_splat_constant0(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = shl <2 x i32> <i32 5, i32 5>, %splat
   ret <2 x i32> %r
 }
@@ -872,7 +872,7 @@ define <2 x i32> @shl_splat_constant1(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = shl <2 x i32> %splat, <i32 5, i32 5>
   ret <2 x i32> %r
 }
@@ -883,7 +883,7 @@ define <2 x i32> @ashr_splat_constant0(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = ashr <2 x i32> <i32 5, i32 5>, %splat
   ret <2 x i32> %r
 }
@@ -894,7 +894,7 @@ define <2 x i32> @ashr_splat_constant1(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = ashr <2 x i32> %splat, <i32 5, i32 5>
   ret <2 x i32> %r
 }
@@ -905,7 +905,7 @@ define <2 x i32> @lshr_splat_constant0(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = lshr <2 x i32> <i32 5, i32 5>, %splat
   ret <2 x i32> %r
 }
@@ -916,18 +916,18 @@ define <2 x i32> @lshr_splat_constant1(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = lshr <2 x i32> %splat, <i32 5, i32 5>
   ret <2 x i32> %r
 }
 
 define <2 x i32> @urem_splat_constant0(<2 x i32> %x) {
 ; CHECK-LABEL: @urem_splat_constant0(
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = urem <2 x i32> <i32 42, i32 42>, [[SPLAT]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = urem <2 x i32> <i32 42, i32 42>, %splat
   ret <2 x i32> %r
 }
@@ -938,18 +938,18 @@ define <2 x i32> @urem_splat_constant1(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = urem <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
 
 define <2 x i32> @srem_splat_constant0(<2 x i32> %x) {
 ; CHECK-LABEL: @srem_splat_constant0(
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = srem <2 x i32> <i32 42, i32 42>, [[SPLAT]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = srem <2 x i32> <i32 42, i32 42>, %splat
   ret <2 x i32> %r
 }
@@ -960,18 +960,18 @@ define <2 x i32> @srem_splat_constant1(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = srem <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
 
 define <2 x i32> @udiv_splat_constant0(<2 x i32> %x) {
 ; CHECK-LABEL: @udiv_splat_constant0(
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = udiv <2 x i32> <i32 42, i32 42>, [[SPLAT]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = udiv <2 x i32> <i32 42, i32 42>, %splat
   ret <2 x i32> %r
 }
@@ -982,18 +982,18 @@ define <2 x i32> @udiv_splat_constant1(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = udiv <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
 
 define <2 x i32> @sdiv_splat_constant0(<2 x i32> %x) {
 ; CHECK-LABEL: @sdiv_splat_constant0(
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = sdiv <2 x i32> <i32 42, i32 42>, [[SPLAT]]
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = sdiv <2 x i32> <i32 42, i32 42>, %splat
   ret <2 x i32> %r
 }
@@ -1004,7 +1004,7 @@ define <2 x i32> @sdiv_splat_constant1(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = sdiv <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
@@ -1015,7 +1015,7 @@ define <2 x i32> @and_splat_constant(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = and <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
@@ -1025,12 +1025,12 @@ define <2 x i32> @and_splat_constant(<2 x i32> %x) {
 define <4 x i16> @and_constant_mask_undef(<4 x i16> %add) {
 ; CHECK-LABEL: @and_constant_mask_undef(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[ADD:%.*]], <4 x i16> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[ADD:%.*]], <4 x i16> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
 ; CHECK-NEXT:    [[AND:%.*]] = and <4 x i16> [[SHUFFLE]], <i16 0, i16 0, i16 -1, i16 -1>
 ; CHECK-NEXT:    ret <4 x i16> [[AND]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %add, <4 x i16> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
+  %shuffle = shufflevector <4 x i16> %add, <4 x i16> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
   %and = and <4 x i16> %shuffle, <i16 0, i16 0, i16 -1, i16 -1>
   ret <4 x i16> %and
 }
@@ -1040,12 +1040,12 @@ entry:
 define <4 x i16> @and_constant_mask_undef_2(<4 x i16> %add) {
 ; CHECK-LABEL: @and_constant_mask_undef_2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[ADD:%.*]], <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[ADD:%.*]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 undef>
 ; CHECK-NEXT:    [[AND:%.*]] = and <4 x i16> [[SHUFFLE]], <i16 -1, i16 -1, i16 -1, i16 0>
 ; CHECK-NEXT:    ret <4 x i16> [[AND]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %add, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 undef>
+  %shuffle = shufflevector <4 x i16> %add, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 undef>
   %and = and <4 x i16> %shuffle, <i16 -1, i16 -1, i16 -1, i16 -0>
   ret <4 x i16> %and
 }
@@ -1057,7 +1057,7 @@ define <4 x i16> @and_constant_mask_undef_3(<4 x i16> %add) {
 ; CHECK-NEXT:    ret <4 x i16> <i16 0, i16 0, i16 0, i16 undef>
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %add, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 undef>
+  %shuffle = shufflevector <4 x i16> %add, <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 undef>
   %and = and <4 x i16> %shuffle, <i16 0, i16 0, i16 0, i16 -1>
   ret <4 x i16> %and
 }
@@ -1071,7 +1071,7 @@ define <4 x i16> @and_constant_mask_undef_4(<4 x i16> %add) {
 ; CHECK-NEXT:    ret <4 x i16> [[AND]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %add, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 undef>
+  %shuffle = shufflevector <4 x i16> %add, <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 undef>
   %and = and <4 x i16> %shuffle, <i16 9, i16 20, i16 20, i16 -1>
   ret <4 x i16> %and
 }
@@ -1084,7 +1084,7 @@ define <4 x i16> @and_constant_mask_not_undef(<4 x i16> %add) {
 ; CHECK-NEXT:    ret <4 x i16> [[AND]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %add, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 1>
+  %shuffle = shufflevector <4 x i16> %add, <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 1>
   %and = and <4 x i16> %shuffle, <i16 0, i16 0, i16 -1, i16 -1>
   ret <4 x i16> %and
 }
@@ -1094,12 +1094,12 @@ entry:
 define <4 x i16> @or_constant_mask_undef(<4 x i16> %in) {
 ; CHECK-LABEL: @or_constant_mask_undef(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[IN:%.*]], <4 x i16> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[IN:%.*]], <4 x i16> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
 ; CHECK-NEXT:    [[OR:%.*]] = or <4 x i16> [[SHUFFLE]], <i16 -1, i16 -1, i16 0, i16 0>
 ; CHECK-NEXT:    ret <4 x i16> [[OR]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
   %or = or <4 x i16> %shuffle, <i16 -1, i16 -1, i16 0, i16 0>
   ret <4 x i16> %or
 }
@@ -1109,12 +1109,12 @@ entry:
 define <4 x i16> @or_constant_mask_undef_2(<4 x i16> %in) {
 ; CHECK-LABEL: @or_constant_mask_undef_2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[IN:%.*]], <4 x i16> undef, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[IN:%.*]], <4 x i16> poison, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
 ; CHECK-NEXT:    [[OR:%.*]] = or <4 x i16> [[SHUFFLE]], <i16 -1, i16 0, i16 0, i16 -1>
 ; CHECK-NEXT:    ret <4 x i16> [[OR]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
   %or = or <4 x i16> %shuffle, <i16 -1, i16 0, i16 0, i16 -1>
   ret <4 x i16> %or
 }
@@ -1126,7 +1126,7 @@ define <4 x i16> @or_constant_mask_undef_3(<4 x i16> %in) {
 ; CHECK-NEXT:    ret <4 x i16> <i16 undef, i16 -1, i16 -1, i16 undef>
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
   %or = or <4 x i16> %shuffle, <i16 0, i16 -1, i16 -1, i16 0>
   ret <4 x i16> %or
 }
@@ -1140,7 +1140,7 @@ define <4 x i16> @or_constant_mask_undef_4(<4 x i16> %in) {
 ; CHECK-NEXT:    ret <4 x i16> [[OR]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
   %or = or <4 x i16> %shuffle, <i16 0, i16 99, i16 99, i16 0>
   ret <4 x i16> %or
 }
@@ -1153,7 +1153,7 @@ define <4 x i16> @or_constant_mask_not_undef(<4 x i16> %in) {
 ; CHECK-NEXT:    ret <4 x i16> [[AND]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 1>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 1>
   %and = or <4 x i16> %shuffle, <i16 0, i16 0, i16 -1, i16 -1>
   ret <4 x i16> %and
 }
@@ -1161,12 +1161,12 @@ entry:
 define <4 x i16> @shl_constant_mask_undef(<4 x i16> %in) {
 ; CHECK-LABEL: @shl_constant_mask_undef(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[IN:%.*]], <4 x i16> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 1>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[IN:%.*]], <4 x i16> poison, <4 x i32> <i32 0, i32 undef, i32 1, i32 1>
 ; CHECK-NEXT:    [[SHL:%.*]] = shl <4 x i16> [[SHUFFLE]], <i16 10, i16 3, i16 0, i16 0>
 ; CHECK-NEXT:    ret <4 x i16> [[SHL]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 1>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 0, i32 undef, i32 1, i32 1>
   %shl = shl <4 x i16> %shuffle, <i16 10, i16 3, i16 0, i16 0>
   ret <4 x i16> %shl
 }
@@ -1178,7 +1178,7 @@ define <4 x i16> @add_constant_mask_undef(<4 x i16> %in) {
 ; CHECK-NEXT:    ret <4 x i16> [[ADD]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
   %add = add <4 x i16> %shuffle, <i16 10, i16 3, i16 0, i16 0>
   ret <4 x i16> %add
 }
@@ -1191,7 +1191,7 @@ define <4 x i16> @add_constant_mask_undef_2(<4 x i16> %in) {
 ; CHECK-NEXT:    ret <4 x i16> [[ADD]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 1>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 1>
   %add = add <4 x i16> %shuffle, <i16 10, i16 3, i16 0, i16 0>
   ret <4 x i16> %add
 }
@@ -1203,7 +1203,7 @@ define <4 x i16> @sub_constant_mask_undef(<4 x i16> %in) {
 ; CHECK-NEXT:    ret <4 x i16> [[SUB]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 1>
   %sub = sub <4 x i16> %shuffle, <i16 10, i16 3, i16 0, i16 0>
   ret <4 x i16> %sub
 }
@@ -1216,7 +1216,7 @@ define <4 x i16> @sub_constant_mask_undef_2(<4 x i16> %in) {
 ; CHECK-NEXT:    ret <4 x i16> [[SUB]]
 ;
 entry:
-  %shuffle = shufflevector <4 x i16> %in, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 undef>
+  %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 undef>
   %sub = sub <4 x i16> %shuffle, <i16 0, i16 0, i16 10, i16 99>
   ret <4 x i16> %sub
 }
@@ -1227,7 +1227,7 @@ define <2 x i32> @or_splat_constant(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = or <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
@@ -1238,7 +1238,7 @@ define <2 x i32> @xor_splat_constant(<2 x i32> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %splat = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> zeroinitializer
   %r = xor <2 x i32> %splat, <i32 42, i32 42>
   ret <2 x i32> %r
 }
@@ -1249,7 +1249,7 @@ define <2 x float> @fadd_splat_constant(<2 x float> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splat = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %r = fadd <2 x float> %splat, <float 42.0, float 42.0>
   ret <2 x float> %r
 }
@@ -1260,7 +1260,7 @@ define <2 x float> @fsub_splat_constant0(<2 x float> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splat = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %r = fsub <2 x float> <float 42.0, float 42.0>, %splat
   ret <2 x float> %r
 }
@@ -1271,7 +1271,7 @@ define <2 x float> @fsub_splat_constant1(<2 x float> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splat = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %r = fsub <2 x float> %splat, <float 42.0, float 42.0>
   ret <2 x float> %r
 }
@@ -1282,7 +1282,7 @@ define <2 x float> @fneg(<2 x float> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splat = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %r = fsub <2 x float> <float -0.0, float -0.0>, %splat
   ret <2 x float> %r
 }
@@ -1293,7 +1293,7 @@ define <2 x float> @fmul_splat_constant(<2 x float> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splat = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %r = fmul <2 x float> %splat, <float 42.0, float 42.0>
   ret <2 x float> %r
 }
@@ -1304,7 +1304,7 @@ define <2 x float> @fdiv_splat_constant0(<2 x float> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splat = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %r = fdiv <2 x float> <float 42.0, float 42.0>, %splat
   ret <2 x float> %r
 }
@@ -1315,7 +1315,7 @@ define <2 x float> @fdiv_splat_constant1(<2 x float> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splat = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %r = fdiv <2 x float> %splat, <float 42.0, float 42.0>
   ret <2 x float> %r
 }
@@ -1326,7 +1326,7 @@ define <2 x float> @frem_splat_constant0(<2 x float> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splat = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %r = frem <2 x float> <float 42.0, float 42.0>, %splat
   ret <2 x float> %r
 }
@@ -1337,7 +1337,7 @@ define <2 x float> @frem_splat_constant1(<2 x float> %x) {
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splat = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splat = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %r = frem <2 x float> %splat, <float 42.0, float 42.0>
   ret <2 x float> %r
 }
@@ -1347,12 +1347,12 @@ define <2 x float> @frem_splat_constant1(<2 x float> %x) {
 define <2 x i1> @PR40734(<1 x i1> %x, <4 x i1> %y) {
 ; CHECK-LABEL: @PR40734(
 ; CHECK-NEXT:    [[WIDEN:%.*]] = shufflevector <1 x i1> zeroinitializer, <1 x i1> [[X:%.*]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[NARROW:%.*]] = shufflevector <4 x i1> [[Y:%.*]], <4 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[NARROW:%.*]] = shufflevector <4 x i1> [[Y:%.*]], <4 x i1> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[R:%.*]] = and <2 x i1> [[WIDEN]], [[NARROW]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %widen = shufflevector <1 x i1> zeroinitializer, <1 x i1> %x, <2 x i32> <i32 0, i32 1>
-  %narrow = shufflevector <4 x i1> %y, <4 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %narrow = shufflevector <4 x i1> %y, <4 x i1> poison, <2 x i32> <i32 0, i32 1>
   %r = and <2 x i1> %widen, %narrow
   ret <2 x i1> %r
 }
@@ -1361,13 +1361,13 @@ define <2 x i1> @PR40734(<1 x i1> %x, <4 x i1> %y) {
 
 define <7 x i8> @insert_subvector_shuffles(<3 x i8> %x, <3 x i8> %y) {
 ; CHECK-LABEL: @insert_subvector_shuffles(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> undef, <7 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <3 x i8> [[Y:%.*]], <3 x i8> undef, <7 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <7 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <3 x i8> [[Y:%.*]], <3 x i8> poison, <7 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <7 x i8> [[S1]], <7 x i8> [[S2]], <7 x i32> <i32 0, i32 8, i32 1, i32 undef, i32 8, i32 1, i32 9>
 ; CHECK-NEXT:    ret <7 x i8> [[S3]]
 ;
-  %s1 = shufflevector <3 x i8> %x, <3 x i8> undef, <7 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %s2 = shufflevector <3 x i8> %y, <3 x i8> undef, <7 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s1 = shufflevector <3 x i8> %x, <3 x i8> poison, <7 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s2 = shufflevector <3 x i8> %y, <3 x i8> poison, <7 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
   %s3 = shufflevector <7 x i8> %s1, <7 x i8> %s2, <7 x i32> <i32 0, i32 8, i32 1, i32 undef, i32 8, i32 1, i32 9>
   ret <7 x i8> %s3
 }
@@ -1377,8 +1377,8 @@ define <8 x i8> @insert_subvector_shuffles_pow2elts(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <8 x i32> <i32 0, i32 2, i32 1, i32 undef, i32 2, i32 1, i32 3, i32 0>
 ; CHECK-NEXT:    ret <8 x i8> [[S3]]
 ;
-  %s1 = shufflevector <2 x i8> %x, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %s2 = shufflevector <2 x i8> %y, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s1 = shufflevector <2 x i8> %x, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s2 = shufflevector <2 x i8> %y, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <8 x i32> <i32 0, i32 8, i32 1, i32 undef, i32 8, i32 1, i32 9, i32 0>
   ret <8 x i8> %s3
 }
@@ -1388,13 +1388,13 @@ define <8 x i8> @insert_subvector_shuffles_pow2elts(<2 x i8> %x, <2 x i8> %y) {
 
 define <2 x i8> @insert_subvector_shuffles_narrowing(<3 x i8> %x, <3 x i8> %y) {
 ; CHECK-LABEL: @insert_subvector_shuffles_narrowing(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> undef, <7 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <3 x i8> [[Y:%.*]], <3 x i8> undef, <7 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <7 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <3 x i8> [[Y:%.*]], <3 x i8> poison, <7 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <7 x i8> [[S1]], <7 x i8> [[S2]], <2 x i32> <i32 0, i32 8>
 ; CHECK-NEXT:    ret <2 x i8> [[S3]]
 ;
-  %s1 = shufflevector <3 x i8> %x, <3 x i8> undef, <7 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %s2 = shufflevector <3 x i8> %y, <3 x i8> undef, <7 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s1 = shufflevector <3 x i8> %x, <3 x i8> poison, <7 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s2 = shufflevector <3 x i8> %y, <3 x i8> poison, <7 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
   %s3 = shufflevector <7 x i8> %s1, <7 x i8> %s2, <2 x i32> <i32 0, i32 8>
   ret <2 x i8> %s3
 }
@@ -1404,8 +1404,8 @@ define <2 x i8> @insert_subvector_shuffles_narrowing_pow2elts(<4 x i8> %x, <4 x
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 4>
 ; CHECK-NEXT:    ret <2 x i8> [[S3]]
 ;
-  %s1 = shufflevector <4 x i8> %x, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %s2 = shufflevector <4 x i8> %y, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s1 = shufflevector <4 x i8> %x, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s2 = shufflevector <4 x i8> %y, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <2 x i32> <i32 0, i32 8>
   ret <2 x i8> %s3
 }
@@ -1417,8 +1417,8 @@ define <4 x double> @insert_subvector_shuffles_identity(<2 x double> %x) {
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x double> [[S3]]
 ;
-  %s1 = shufflevector <2 x double> %x, <2 x double> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
-  %s2 = shufflevector <2 x double> %x, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %s1 = shufflevector <2 x double> %x, <2 x double> poison, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
+  %s2 = shufflevector <2 x double> %x, <2 x double> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %s3 = shufflevector <4 x double> %s2, <4 x double> %s1, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
   ret <4 x double> %s3
 }
@@ -1427,13 +1427,13 @@ define <4 x double> @insert_subvector_shuffles_identity(<2 x double> %x) {
 
 define <4 x double> @not_insert_subvector_shuffle(<2 x double> %x) {
 ; CHECK-LABEL: @not_insert_subvector_shuffle(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 1>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <2 x double> [[X]], <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> poison, <4 x i32> <i32 undef, i32 1, i32 undef, i32 1>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <4 x double> [[S2]], <4 x double> [[S1]], <4 x i32> <i32 0, i32 5, i32 7, i32 undef>
 ; CHECK-NEXT:    ret <4 x double> [[S3]]
 ;
-  %s1 = shufflevector <2 x double> %x, <2 x double> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 1>
-  %s2 = shufflevector <2 x double> %x, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %s1 = shufflevector <2 x double> %x, <2 x double> poison, <4 x i32> <i32 undef, i32 1, i32 undef, i32 1>
+  %s2 = shufflevector <2 x double> %x, <2 x double> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %s3 = shufflevector <4 x double> %s2, <4 x double> %s1, <4 x i32> <i32 0, i32 5, i32 7, i32 undef>
   ret <4 x double> %s3
 }
@@ -1442,13 +1442,13 @@ define <4 x double> @not_insert_subvector_shuffle(<2 x double> %x) {
 
 define <4 x double> @not_insert_subvector_shuffles_with_same_size(<2 x double> %x, <3 x double> %y) {
 ; CHECK-LABEL: @not_insert_subvector_shuffles_with_same_size(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <3 x double> [[Y:%.*]], <3 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> poison, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <3 x double> [[Y:%.*]], <3 x double> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[S3:%.*]] = shufflevector <4 x double> [[S2]], <4 x double> [[S1]], <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x double> [[S3]]
 ;
-  %s1 = shufflevector <2 x double> %x, <2 x double> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
-  %s2 = shufflevector <3 x double> %y, <3 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %s1 = shufflevector <2 x double> %x, <2 x double> poison, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
+  %s2 = shufflevector <3 x double> %y, <3 x double> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %s3 = shufflevector <4 x double> %s2, <4 x double> %s1, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
   ret <4 x double> %s3
 }
@@ -1458,12 +1458,12 @@ define <4 x double> @not_insert_subvector_shuffles_with_same_size(<2 x double> %
 
 define <4 x float> @insert_subvector_crash_invalid_mask_elt(<2 x float> %x, <4 x float>* %p) {
 ; CHECK-LABEL: @insert_subvector_crash_invalid_mask_elt(
-; CHECK-NEXT:    [[WIDEN:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[WIDEN:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[I:%.*]] = shufflevector <2 x float> [[X]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    store <4 x float> [[I]], <4 x float>* [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <4 x float> [[WIDEN]]
 ;
-  %widen = shufflevector <2 x float> %x, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %widen = shufflevector <2 x float> %x, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %ext2 = extractelement <2 x float> %x, i32 0
   %I = insertelement <4 x float> %widen, float %ext2, i16 0
   store <4 x float> %I, <4 x float>* %p
@@ -1477,7 +1477,7 @@ define <4 x i32> @splat_assoc_add(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i32> [[TMP2]], [[Y:%.*]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
   %a = add <4 x i32> %y, <i32 317426, i32 317426, i32 317426, i32 317426>
   %r = add <4 x i32> %splatx, %a
   ret <4 x i32> %r
@@ -1491,7 +1491,7 @@ define <vscale x 4 x i32> @vsplat_assoc_add(<vscale x 4 x i32> %x, <vscale x 4 x
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
 ;
 
-  %splatx = shufflevector <vscale x 4 x i32> %x, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %splatx = shufflevector <vscale x 4 x i32> %x, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %a = add <vscale x 4 x i32> %y, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 317426, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
   %r = add <vscale x 4 x i32> %splatx, %a
   ret <vscale x 4 x i32> %r
@@ -1506,7 +1506,7 @@ define <4 x i32> @splat_assoc_add_undef_mask_elts(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i32> [[TMP2]], [[Y:%.*]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
   %a = add <4 x i32> %y, <i32 42, i32 42, i32 42, i32 42>
   %r = add <4 x i32> %splatx, %a
   ret <4 x i32> %r
@@ -1521,7 +1521,7 @@ define <4 x i32> @splat_assoc_add_undef_mask_elt_at_splat_index(<4 x i32> %x, <4
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i32> [[TMP2]], [[Y:%.*]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
   %a = add <4 x i32> %y, <i32 42, i32 42, i32 42, i32 42>
   %r = add <4 x i32> %splatx, %a
   ret <4 x i32> %r
@@ -1529,12 +1529,12 @@ define <4 x i32> @splat_assoc_add_undef_mask_elt_at_splat_index(<4 x i32> %x, <4
 
 define <4 x i32> @splat_assoc_add_undef_constant_elts(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @splat_assoc_add_undef_constant_elts(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[A:%.*]] = add <4 x i32> [[Y:%.*]], <i32 42, i32 undef, i32 undef, i32 42>
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i32> [[SPLATX]], [[A]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
   %a = add <4 x i32> %y, <i32 42, i32 undef, i32 undef, i32 42>
   %r = add <4 x i32> %splatx, %a
   ret <4 x i32> %r
@@ -1542,12 +1542,12 @@ define <4 x i32> @splat_assoc_add_undef_constant_elts(<4 x i32> %x, <4 x i32> %y
 
 define <4 x i32> @splat_assoc_add_undef_constant_elt_at_splat_index(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @splat_assoc_add_undef_constant_elt_at_splat_index(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[A:%.*]] = add <4 x i32> [[Y:%.*]], <i32 undef, i32 42, i32 undef, i32 42>
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i32> [[SPLATX]], [[A]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
   %a = add <4 x i32> %y, <i32 undef, i32 42, i32 undef, i32 42>
   %r = add <4 x i32> %splatx, %a
   ret <4 x i32> %r
@@ -1555,12 +1555,12 @@ define <4 x i32> @splat_assoc_add_undef_constant_elt_at_splat_index(<4 x i32> %x
 
 define <4 x i32> @splat_assoc_add_undef_mask_elts_undef_constant_elts(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @splat_assoc_add_undef_mask_elts_undef_constant_elts(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef>
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef>
 ; CHECK-NEXT:    [[A:%.*]] = add <4 x i32> [[Y:%.*]], <i32 42, i32 undef, i32 undef, i32 42>
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i32> [[SPLATX]], [[A]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef>
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef>
   %a = add <4 x i32> %y, <i32 42, i32 undef, i32 undef, i32 42>
   %r = add <4 x i32> %splatx, %a
   ret <4 x i32> %r
@@ -1568,12 +1568,12 @@ define <4 x i32> @splat_assoc_add_undef_mask_elts_undef_constant_elts(<4 x i32>
 
 define <4 x i32> @splat_assoc_add_undef_mask_elt_at_splat_index_undef_constant_elts(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @splat_assoc_add_undef_mask_elt_at_splat_index_undef_constant_elts(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
 ; CHECK-NEXT:    [[A:%.*]] = add <4 x i32> [[Y:%.*]], <i32 42, i32 undef, i32 undef, i32 42>
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i32> [[SPLATX]], [[A]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
   %a = add <4 x i32> %y, <i32 42, i32 undef, i32 undef, i32 42>
   %r = add <4 x i32> %splatx, %a
   ret <4 x i32> %r
@@ -1581,12 +1581,12 @@ define <4 x i32> @splat_assoc_add_undef_mask_elt_at_splat_index_undef_constant_e
 
 define <4 x i32> @splat_assoc_add_undef_mask_elt_at_splat_index_undef_constant_elt_at_splat_index(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @splat_assoc_add_undef_mask_elt_at_splat_index_undef_constant_elt_at_splat_index(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
 ; CHECK-NEXT:    [[A:%.*]] = add <4 x i32> [[Y:%.*]], <i32 undef, i32 42, i32 undef, i32 42>
 ; CHECK-NEXT:    [[R:%.*]] = add <4 x i32> [[SPLATX]], [[A]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
   %a = add <4 x i32> %y, <i32 undef, i32 42, i32 undef, i32 42>
   %r = add <4 x i32> %splatx, %a
   ret <4 x i32> %r
@@ -1601,7 +1601,7 @@ define <2 x float> @splat_assoc_fmul(<2 x float> %x, <2 x float> %y) {
 ; CHECK-NEXT:    [[R:%.*]] = fmul reassoc nsz <2 x float> [[TMP2]], [[Y:%.*]]
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splatx = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %splatx = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 1>
   %a = fmul reassoc nsz <2 x float> %y, <float 3.0, float 3.0>
   %r = fmul reassoc nsz nnan <2 x float> %a, %splatx
   ret <2 x float> %r
@@ -1616,8 +1616,8 @@ define <3 x i8> @splat_assoc_mul(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) {
 ; CHECK-NEXT:    [[R:%.*]] = mul <3 x i8> [[TMP2]], [[Y:%.*]]
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
-  %splatx = shufflevector <3 x i8> %x, <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
-  %splatz = shufflevector <3 x i8> %z, <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
+  %splatx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
+  %splatz = shufflevector <3 x i8> %z, <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
   %a = mul nsw <3 x i8> %y, %splatz
   %r = mul <3 x i8> %a, %splatx
   ret <3 x i8> %r
@@ -1630,8 +1630,8 @@ define <3 x i8> @splat_assoc_mul_undef_elt1(<3 x i8> %x, <3 x i8> %y, <3 x i8> %
 ; CHECK-NEXT:    [[R:%.*]] = mul <3 x i8> [[TMP2]], [[Y:%.*]]
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
-  %splatx = shufflevector <3 x i8> %x, <3 x i8> undef, <3 x i32> <i32 undef, i32 2, i32 2>
-  %splatz = shufflevector <3 x i8> %z, <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
+  %splatx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 undef, i32 2, i32 2>
+  %splatz = shufflevector <3 x i8> %z, <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
   %a = mul nsw <3 x i8> %y, %splatz
   %r = mul nsw nuw <3 x i8> %a, %splatx
   ret <3 x i8> %r
@@ -1639,14 +1639,14 @@ define <3 x i8> @splat_assoc_mul_undef_elt1(<3 x i8> %x, <3 x i8> %y, <3 x i8> %
 
 define <3 x i8> @splat_assoc_mul_undef_elt2(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) {
 ; CHECK-LABEL: @splat_assoc_mul_undef_elt2(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> undef, <3 x i32> <i32 undef, i32 2, i32 2>
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> poison, <3 x i32> <i32 undef, i32 2, i32 2>
 ; CHECK-NEXT:    [[A:%.*]] = mul nsw <3 x i8> [[SPLATZ]], [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = mul nuw nsw <3 x i8> [[A]], [[SPLATX]]
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
-  %splatx = shufflevector <3 x i8> %x, <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
-  %splatz = shufflevector <3 x i8> %z, <3 x i8> undef, <3 x i32> <i32 undef, i32 2, i32 2>
+  %splatx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
+  %splatz = shufflevector <3 x i8> %z, <3 x i8> poison, <3 x i32> <i32 undef, i32 2, i32 2>
   %a = mul nsw <3 x i8> %y, %splatz
   %r = mul nsw nuw <3 x i8> %a, %splatx
   ret <3 x i8> %r
@@ -1659,8 +1659,8 @@ define <3 x i8> @splat_assoc_mul_undef_elt_at_splat_index1(<3 x i8> %x, <3 x i8>
 ; CHECK-NEXT:    [[R:%.*]] = mul <3 x i8> [[TMP2]], [[Y:%.*]]
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
-  %splatx = shufflevector <3 x i8> %x, <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 undef>
-  %splatz = shufflevector <3 x i8> %z, <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
+  %splatx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 undef>
+  %splatz = shufflevector <3 x i8> %z, <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
   %a = mul nsw <3 x i8> %y, %splatz
   %r = mul nsw nuw <3 x i8> %a, %splatx
   ret <3 x i8> %r
@@ -1668,14 +1668,14 @@ define <3 x i8> @splat_assoc_mul_undef_elt_at_splat_index1(<3 x i8> %x, <3 x i8>
 
 define <3 x i8> @splat_assoc_mul_undef_elt_at_splat_index2(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) {
 ; CHECK-LABEL: @splat_assoc_mul_undef_elt_at_splat_index2(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 undef>
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 undef>
 ; CHECK-NEXT:    [[A:%.*]] = mul nsw <3 x i8> [[SPLATZ]], [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = mul nuw nsw <3 x i8> [[A]], [[SPLATX]]
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
-  %splatx = shufflevector <3 x i8> %x, <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
-  %splatz = shufflevector <3 x i8> %z, <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 undef>
+  %splatx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
+  %splatz = shufflevector <3 x i8> %z, <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 undef>
   %a = mul nsw <3 x i8> %y, %splatz
   %r = mul nsw nuw <3 x i8> %a, %splatx
   ret <3 x i8> %r
@@ -1685,14 +1685,14 @@ define <3 x i8> @splat_assoc_mul_undef_elt_at_splat_index2(<3 x i8> %x, <3 x i8>
 
 define <3 x i8> @splat_assoc_or(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) {
 ; CHECK-LABEL: @splat_assoc_or(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
 ; CHECK-NEXT:    [[A:%.*]] = or <3 x i8> [[SPLATZ]], [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = or <3 x i8> [[A]], [[SPLATX]]
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
-  %splatx = shufflevector <3 x i8> %x, <3 x i8> undef, <3 x i32> <i32 1, i32 1, i32 1>
-  %splatz = shufflevector <3 x i8> %z, <3 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
+  %splatx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 1, i32 1>
+  %splatz = shufflevector <3 x i8> %z, <3 x i8> poison, <3 x i32> <i32 2, i32 2, i32 2>
   %a = or <3 x i8> %y, %splatz
   %r = or <3 x i8> %a, %splatx
   ret <3 x i8> %r
@@ -1702,12 +1702,12 @@ define <3 x i8> @splat_assoc_or(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) {
 
 define <2 x float> @splat_assoc_fdiv(<2 x float> %x, <2 x float> %y) {
 ; CHECK-LABEL: @splat_assoc_fdiv(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[A:%.*]] = fdiv reassoc nsz <2 x float> [[Y:%.*]], <float 3.000000e+00, float 3.000000e+00>
 ; CHECK-NEXT:    [[R:%.*]] = fdiv reassoc nsz <2 x float> [[A]], [[SPLATX]]
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splatx = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> zeroinitializer
+  %splatx = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> zeroinitializer
   %a = fdiv reassoc nsz <2 x float> %y, <float 3.0, float 3.0>
   %r = fdiv reassoc nsz <2 x float> %a, %splatx
   ret <2 x float> %r
@@ -1717,13 +1717,13 @@ define <2 x float> @splat_assoc_fdiv(<2 x float> %x, <2 x float> %y) {
 
 define <2 x float> @splat_assoc_fadd(<2 x float> %x, <2 x float> %y) {
 ; CHECK-LABEL: @splat_assoc_fadd(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[A:%.*]] = fadd fast <2 x float> [[Y:%.*]], <float 3.000000e+00, float 3.000000e+00>
 ; CHECK-NEXT:    call void @use(<2 x float> [[A]])
 ; CHECK-NEXT:    [[R:%.*]] = fadd fast <2 x float> [[A]], [[SPLATX]]
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
-  %splatx = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %splatx = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 1>
   %a = fadd fast <2 x float> %y, <float 3.0, float 3.0>
   call void @use(<2 x float> %a)
   %r = fadd fast <2 x float> %a, %splatx
@@ -1734,12 +1734,12 @@ define <2 x float> @splat_assoc_fadd(<2 x float> %x, <2 x float> %y) {
 
 define <3 x i32> @splat_assoc_and(<4 x i32> %x, <3 x i32> %y) {
 ; CHECK-LABEL: @splat_assoc_and(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <3 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <3 x i32> zeroinitializer
 ; CHECK-NEXT:    [[A:%.*]] = and <3 x i32> [[Y:%.*]], <i32 42, i32 42, i32 42>
 ; CHECK-NEXT:    [[R:%.*]] = and <3 x i32> [[SPLATX]], [[A]]
 ; CHECK-NEXT:    ret <3 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <3 x i32> zeroinitializer
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <3 x i32> zeroinitializer
   %a = and <3 x i32> %y, <i32 42, i32 42, i32 42>
   %r = and <3 x i32> %splatx, %a
   ret <3 x i32> %r
@@ -1749,12 +1749,12 @@ define <3 x i32> @splat_assoc_and(<4 x i32> %x, <3 x i32> %y) {
 
 define <5 x i32> @splat_assoc_xor(<4 x i32> %x, <5 x i32> %y) {
 ; CHECK-LABEL: @splat_assoc_xor(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <5 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <5 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = xor <5 x i32> [[SPLATX]], [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = xor <5 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42>
 ; CHECK-NEXT:    ret <5 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <5 x i32> zeroinitializer
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <5 x i32> zeroinitializer
   %a = xor <5 x i32> %y, <i32 42, i32 42, i32 42, i32 42, i32 42>
   %r = xor <5 x i32> %splatx, %a
   ret <5 x i32> %r
@@ -1764,12 +1764,12 @@ define <5 x i32> @splat_assoc_xor(<4 x i32> %x, <5 x i32> %y) {
 
 define <4 x i32> @splat_assoc_add_mul(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @splat_assoc_add_mul(
-; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[A:%.*]] = add <4 x i32> [[Y:%.*]], <i32 42, i32 42, i32 42, i32 42>
 ; CHECK-NEXT:    [[R:%.*]] = mul <4 x i32> [[SPLATX]], [[A]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %splatx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splatx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
   %a = add <4 x i32> %y, <i32 42, i32 42, i32 42, i32 42>
   %r = mul <4 x i32> %splatx, %a
   ret <4 x i32> %r
@@ -1780,11 +1780,11 @@ define <4 x i32> @splat_assoc_add_mul(<4 x i32> %x, <4 x i32> %y) {
 
 define <4 x i32> @PR46872(<4 x i32> %x) {
 ; CHECK-LABEL: @PR46872(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[A:%.*]] = and <4 x i32> [[S]], bitcast (<2 x i64> <i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64), i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64)> to <4 x i32>)
 ; CHECK-NEXT:    ret <4 x i32> [[A]]
 ;
-  %s = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 1>
+  %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 1>
   %a = and <4 x i32> %s, bitcast (<2 x i64> <i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64), i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64)> to <4 x i32>)
   ret <4 x i32> %a
 }

diff --git a/llvm/test/Transforms/InstCombine/vector-concat-binop-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vector-concat-binop-inseltpoison.ll
new file mode 100644
index 000000000000..83a4f7a4a5e5
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/vector-concat-binop-inseltpoison.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+define <4 x i8> @add(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @add(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i8> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i8> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %r = and <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+; Flags should propagate.
+
+define <4 x i8> @sub(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @sub(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <2 x i8> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i8> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %r = sub nsw <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+; Flags should propagate.
+
+define <4 x i8> @mul(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @mul(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw <2 x i8> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw <2 x i8> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %r = mul nuw <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+; Undef in shuffle mask does not necessarily propagate.
+
+define <4 x i8> @and(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @and(
+; CHECK-NEXT:    [[CONCAT1:%.*]] = shufflevector <2 x i8> [[A:%.*]], <2 x i8> [[B:%.*]], <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[CONCAT2:%.*]] = shufflevector <2 x i8> [[C:%.*]], <2 x i8> [[D:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = and <4 x i8> [[CONCAT1]], [[CONCAT2]]
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %r = and <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+; Undef in shuffle mask does not necessarily propagate.
+
+define <4 x i8> @or(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @or(
+; CHECK-NEXT:    [[CONCAT1:%.*]] = shufflevector <2 x i8> [[A:%.*]], <2 x i8> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[CONCAT2:%.*]] = shufflevector <2 x i8> [[C:%.*]], <2 x i8> [[D:%.*]], <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = or <4 x i8> [[CONCAT1]], [[CONCAT2]]
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
+  %r = or <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+; Undefs in shuffle mask do not necessarily propagate.
+
+define <4 x i8> @xor(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @xor(
+; CHECK-NEXT:    [[CONCAT1:%.*]] = shufflevector <2 x i8> [[A:%.*]], <2 x i8> [[B:%.*]], <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
+; CHECK-NEXT:    [[CONCAT2:%.*]] = shufflevector <2 x i8> [[C:%.*]], <2 x i8> [[D:%.*]], <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = xor <4 x i8> [[CONCAT1]], [[CONCAT2]]
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+  %r = xor <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+define <4 x i8> @shl(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @shl(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw <2 x i8> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw <2 x i8> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+  %r = shl nuw <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+define <4 x i8> @lshr(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @lshr(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact <2 x i8> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr exact <2 x i8> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> [[TMP2]], <4 x i32> <i32 0, i32 undef, i32 undef, i32 3>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 undef, i32 undef, i32 3>
+  %r = lshr exact <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+; Extra-uses prevent the transform.
+declare void @use(<4 x i8>)
+
+define <4 x i8> @ashr(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @ashr(
+; CHECK-NEXT:    [[CONCAT1:%.*]] = shufflevector <2 x i8> [[A:%.*]], <2 x i8> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @use(<4 x i8> [[CONCAT1]])
+; CHECK-NEXT:    [[CONCAT2:%.*]] = shufflevector <2 x i8> [[C:%.*]], <2 x i8> [[D:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = ashr <4 x i8> [[CONCAT1]], [[CONCAT2]]
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @use(<4 x i8> %concat1)
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %r = ashr <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+; TODO: Div/rem with undef in any element in the divisor is undef, so this should be simplified away?
+
+define <4 x i8> @sdiv(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @sdiv(
+; CHECK-NEXT:    [[TMP1:%.*]] = sdiv exact <2 x i8> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sdiv exact <2 x i8> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+  %r = sdiv exact <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+define <4 x i8> @srem(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @srem(
+; CHECK-NEXT:    [[TMP1:%.*]] = srem <2 x i8> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = srem <2 x i8> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %r = srem <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+define <4 x i8> @udiv(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @udiv(
+; CHECK-NEXT:    [[TMP1:%.*]] = udiv exact <2 x i8> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = udiv exact <2 x i8> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %r = udiv exact <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+; TODO: Div/rem with undef in any element in the divisor is undef, so this should be simplified away?
+
+define <4 x i8> @urem(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; CHECK-LABEL: @urem(
+; CHECK-NEXT:    [[TMP1:%.*]] = urem <2 x i8> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = urem <2 x i8> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> [[TMP2]], <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %concat1 = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x i8> %c, <2 x i8> %d, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+  %r = urem <4 x i8> %concat1, %concat2
+  ret <4 x i8> %r
+}
+
+define <4 x float> @fadd(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: @fadd(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %concat1 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %concat1, %concat2
+  ret <4 x float> %r
+}
+
+; Fast-math-flags propagate.
+
+define <4 x float> @fsub(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: @fsub(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub fast <2 x float> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %concat1 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+  %concat2 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
+  %r = fsub fast <4 x float> %concat1, %concat2
+  ret <4 x float> %r
+}
+
+; Extra-uses prevent the transform.
+declare void @use2(<4 x float>)
+
+define <4 x float> @fmul(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: @fmul(
+; CHECK-NEXT:    [[CONCAT1:%.*]] = shufflevector <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
+; CHECK-NEXT:    [[CONCAT2:%.*]] = shufflevector <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
+; CHECK-NEXT:    call void @use2(<4 x float> [[CONCAT2]])
+; CHECK-NEXT:    [[R:%.*]] = fmul nnan <4 x float> [[CONCAT1]], [[CONCAT2]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %concat1 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
+  %concat2 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
+  call void @use2(<4 x float> %concat2)
+  %r = fmul nnan <4 x float> %concat1, %concat2
+  ret <4 x float> %r
+}
+
+; Fast-math-flags propagate.
+
+define <4 x float> @fdiv(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: @fdiv(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv ninf arcp <2 x float> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv ninf arcp <2 x float> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %concat1 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %concat2 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %r = fdiv ninf arcp <4 x float> %concat1, %concat2
+  ret <4 x float> %r
+}
+
+define <4 x float> @frem(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: @frem(
+; CHECK-NEXT:    [[TMP1:%.*]] = frem <2 x float> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = frem <2 x float> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %concat1 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
+  %concat2 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
+  %r = frem <4 x float> %concat1, %concat2
+  ret <4 x float> %r
+}
+
+; https://bugs.llvm.org/show_bug.cgi?id=33026 - all of the shuffles can be eliminated.
+
+define <4 x i32> @PR33026(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
+; CHECK-LABEL: @PR33026(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[B:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[SUB]]
+;
+  %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %and = and <8 x i32> %concat1, %concat2
+  %extract1 = shufflevector <8 x i32> %and, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract2 = shufflevector <8 x i32> %and, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %sub = sub <4 x i32> %extract1, %extract2
+  ret <4 x i32> %sub
+}

diff --git a/llvm/test/Transforms/InstCombine/vector_gep1-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vector_gep1-inseltpoison.ll
index 29c4471183d3..0141112d8834 100644
--- a/llvm/test/Transforms/InstCombine/vector_gep1-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vector_gep1-inseltpoison.ll
@@ -64,10 +64,10 @@ define <2 x i32*> @test7(<2 x {i32, i32}*> %a) {
 
 define <vscale x 2 x i1> @test8() {
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT:    ret <vscale x 2 x i1> icmp ult (<vscale x 2 x i64> zext (<vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer) to <vscale x 2 x i64>), <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x i1> icmp ult (<vscale x 2 x i64> zext (<vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer) to <vscale x 2 x i64>), <vscale x 2 x i64> zeroinitializer)
 ;
   %ins = insertelement <vscale x 2 x i32> poison, i32 1, i32 0
-  %b = shufflevector <vscale x 2 x i32> %ins, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %b = shufflevector <vscale x 2 x i32> %ins, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
   %c = inttoptr <vscale x 2 x i32> %b to <vscale x 2 x i8*>
   %d = icmp ult <vscale x 2 x i8*> %c, zeroinitializer
   ret <vscale x 2 x i1> %d

diff --git a/llvm/test/Transforms/InstCombine/vscale_extractelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vscale_extractelement-inseltpoison.ll
index 43dc22c20cd2..de04e4562d1a 100644
--- a/llvm/test/Transforms/InstCombine/vscale_extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vscale_extractelement-inseltpoison.ll
@@ -59,12 +59,12 @@ define i8 @extractelement_bitcast_wrong_insert(<vscale x 2 x i32> %a, i32 %x) {
 define i32 @extractelement_shuffle_in_range(i32 %v) {
 ; CHECK-LABEL: @extractelement_shuffle_in_range(
 ; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[SPLAT]], i32 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
-  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %r = extractelement <vscale x 4 x i32> %splat, i32 1
   ret i32 %r
 }
@@ -72,12 +72,12 @@ define i32 @extractelement_shuffle_in_range(i32 %v) {
 define i32 @extractelement_shuffle_maybe_out_of_range(i32 %v) {
 ; CHECK-LABEL: @extractelement_shuffle_maybe_out_of_range(
 ; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[SPLAT]], i32 4
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
-  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %r = extractelement <vscale x 4 x i32> %splat, i32 4
   ret i32 %r
 }
@@ -85,12 +85,12 @@ define i32 @extractelement_shuffle_maybe_out_of_range(i32 %v) {
 define i32 @extractelement_shuffle_invalid_index(i32 %v) {
 ; CHECK-LABEL: @extractelement_shuffle_invalid_index(
 ; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[SPLAT]], i32 -1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
-  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %r = extractelement <vscale x 4 x i32> %splat, i32 -1
   ret i32 %r
 }
@@ -99,12 +99,12 @@ define i32 @extractelement_shuffle_invalid_index(i32 %v) {
 define i32 @extractelement_shuffle_symbolic_index(i32 %v, i32 %idx) {
 ; CHECK-LABEL: @extractelement_shuffle_symbolic_index(
 ; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[SPLAT]], i32 [[IDX:%.*]]
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
-  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %r = extractelement <vscale x 4 x i32> %splat, i32 %idx
   ret i32 %r
 }

diff  --git a/llvm/test/Transforms/InstCombine/vscale_insertelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vscale_insertelement-inseltpoison.ll
index 252483664f9b..35a139fc8779 100644
--- a/llvm/test/Transforms/InstCombine/vscale_insertelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vscale_insertelement-inseltpoison.ll
@@ -89,13 +89,13 @@ define <vscale x 4 x float> @insertelement_sequene_may_not_be_splat(float %x) {
 define void @ossfuzz_27416(i32 %v) {
 ; CHECK-LABEL: @ossfuzz_27416(
 ; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
-; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <vscale x 4 x i32> [[SPLAT]], i32 undef, i8 -128
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[I1]], <vscale x 4 x i32>* undef, align 16
 ; CHECK-NEXT:    ret void
 ;
   %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
-  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %I1 = insertelement <vscale x 4 x i32> %splat, i32 undef, i8 -128
   store <vscale x 4 x i32> %I1, <vscale x 4 x i32>* undef, align 16
   ret void

diff  --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts-inseltpoison.ll
new file mode 100644
index 000000000000..6ce03dd2e0f0
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts-inseltpoison.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S -o - | FileCheck %s
+
+; When both operands are undef in a lane, that lane should produce an undef result.
+
+define <3 x i8> @shl() {
+; CHECK-LABEL: @shl(
+; CHECK-NEXT:    ret <3 x i8> <i8 poison, i8 0, i8 0>
+;
+  %c = shl <3 x i8> undef, <i8 undef, i8 4, i8 1>
+  ret <3 x i8> %c
+}
+
+define <3 x i8> @and() {
+; CHECK-LABEL: @and(
+; CHECK-NEXT:    ret <3 x i8> <i8 undef, i8 0, i8 undef>
+;
+  %c = and <3 x i8> <i8 undef, i8 42, i8 undef>, undef
+  ret <3 x i8> %c
+}
+
+define <3 x i8> @and_commute() {
+; CHECK-LABEL: @and_commute(
+; CHECK-NEXT:    ret <3 x i8> <i8 0, i8 0, i8 undef>
+;
+  %c = and <3 x i8> undef, <i8 -42, i8 42, i8 undef>
+  ret <3 x i8> %c
+}
+
+define <3 x i8> @or() {
+; CHECK-LABEL: @or(
+; CHECK-NEXT:    ret <3 x i8> <i8 undef, i8 -1, i8 undef>
+;
+  %c = or <3 x i8> <i8 undef, i8 42, i8 undef>, undef
+  ret <3 x i8> %c
+}
+
+define <3 x i8> @or_commute() {
+; CHECK-LABEL: @or_commute(
+; CHECK-NEXT:    ret <3 x i8> <i8 -1, i8 -1, i8 undef>
+;
+  %c = or <3 x i8> undef, <i8 -42, i8 42, i8 undef>
+  ret <3 x i8> %c
+}
+
+define <3 x float> @fadd() {
+; CHECK-LABEL: @fadd(
+; CHECK-NEXT:    ret <3 x float> <float undef, float 0x7FF8000000000000, float undef>
+;
+  %c = fadd <3 x float> <float undef, float 42.0, float undef>, undef
+  ret <3 x float> %c
+}
+
+define <3 x float> @fadd_commute() {
+; CHECK-LABEL: @fadd_commute(
+; CHECK-NEXT:    ret <3 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float undef>
+;
+  %c = fadd <3 x float> undef, <float -42.0, float 42.0, float undef>
+  ret <3 x float> %c
+}
+
+define <4 x i32> @shuffle_of_undefs(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: @shuffle_of_undefs(
+; CHECK-NEXT:    ret <4 x i32> <i32 poison, i32 undef, i32 undef, i32 undef>
+;
+  %r = shufflevector <4 x i32> undef, <4 x i32> poison, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+  ret <4 x i32> %r
+}
+

diff  --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
index e91d943c5eb7..9689887be69b 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
@@ -195,10 +195,10 @@ define <vscale x 4 x i32> @insertelement() {
 
 define <vscale x 4 x i32> @shufflevector() {
 ; CHECK-LABEL: @shufflevector(
-; CHECK-NEXT:    ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ;
   %i = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
-  %i2 = shufflevector <vscale x 4 x i32> %i, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %i2 = shufflevector <vscale x 4 x i32> %i, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   ret <vscale x 4 x i32> %i2
 }
 
@@ -221,10 +221,10 @@ define <vscale x 2 x double> @load() {
 
 define <vscale x 4 x float> @bitcast() {
 ; CHECK-LABEL: @bitcast(
-; CHECK-NEXT:    ret <vscale x 4 x float> bitcast (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x float>)
+; CHECK-NEXT:    ret <vscale x 4 x float> bitcast (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x float>)
 ;
   %i1 = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
-  %i2 = shufflevector <vscale x 4 x i32> %i1, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %i2 = shufflevector <vscale x 4 x i32> %i1, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %i3 = bitcast <vscale x 4 x i32> %i2 to <vscale x 4 x float>
   ret <vscale x 4 x float> %i3
 }

diff  --git a/llvm/test/Transforms/InstSimplify/shufflevector-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/shufflevector-inseltpoison.ll
new file mode 100644
index 000000000000..74edb9f995ef
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/shufflevector-inseltpoison.ll
@@ -0,0 +1,286 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define <4 x i32> @const_folding(<4 x i32> %x) {
+; CHECK-LABEL: @const_folding(
+; CHECK-NEXT:    ret <4 x i32> zeroinitializer
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer, <4 x i32> <i32 5, i32 4, i32 5, i32 4>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @const_folding1(<4 x i32> %x) {
+; CHECK-LABEL: @const_folding1(
+; CHECK-NEXT:    ret <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+;
+  %shuf = shufflevector <4 x i32> <i32 5, i32 4, i32 5, i32 4>, <4 x i32> %x, <4 x i32> zeroinitializer
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @const_folding_negative(<3 x i32> %x) {
+; CHECK-LABEL: @const_folding_negative(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x i32> [[X:%.*]], <3 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 5, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %shuf = shufflevector <3 x i32> %x, <3 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 5, i32 4>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand(<4 x i32> %x) {
+; CHECK-LABEL: @splat_operand(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SPLAT]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand_poison(<4 x i32> %x) {
+; CHECK-LABEL: @splat_operand_poison(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SPLAT]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand1(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @splat_operand1(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SPLAT]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @splat_operand2(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SPLAT]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> %y, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand3(<4 x i32> %x) {
+; CHECK-LABEL: @splat_operand3(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SPLAT]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> zeroinitializer, <4 x i32> %splat, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
+  ret <4 x i32> %shuf
+}
+
+define <8 x i32> @splat_operand_negative(<4 x i32> %x) {
+; CHECK-LABEL: @splat_operand_negative(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> poison, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand_negative2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @splat_operand_negative2(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[SPLAT]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 4, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> %y, <4 x i32> <i32 0, i32 3, i32 4, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand_negative3(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @splat_operand_negative3(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[SPLAT]], <4 x i32> <i32 0, i32 3, i32 4, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %y, <4 x i32> %splat, <4 x i32> <i32 0, i32 3, i32 4, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand_negative4(<4 x i32> %x) {
+; CHECK-LABEL: @splat_operand_negative4(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[SPLAT]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @undef_mask(<4 x i32> %x) {
+; CHECK-LABEL: @undef_mask(
+; CHECK-NEXT:    ret <4 x i32> undef
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> poison
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @undef_mask_1(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @undef_mask_1(
+; CHECK-NEXT:    ret <4 x i32> undef
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> poison
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @identity_mask_0(<4 x i32> %x) {
+; CHECK-LABEL: @identity_mask_0(
+; CHECK-NEXT:    ret <4 x i32> [[X:%.*]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @identity_mask_1(<4 x i32> %x) {
+; CHECK-LABEL: @identity_mask_1(
+; CHECK-NEXT:    ret <4 x i32> [[X:%.*]]
+;
+  %shuf = shufflevector <4 x i32> undef, <4 x i32> %x, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @pseudo_identity_mask(<4 x i32> %x) {
+; CHECK-LABEL: @pseudo_identity_mask(
+; CHECK-NEXT:    ret <4 x i32> [[X:%.*]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @not_identity_mask(<4 x i32> %x) {
+; CHECK-LABEL: @not_identity_mask(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+  ret <4 x i32> %shuf
+}
+
+; TODO: Should we simplify if the mask has an undef element?
+
+define <4 x i32> @possible_identity_mask(<4 x i32> %x) {
+; CHECK-LABEL: @possible_identity_mask(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @const_operand(<4 x i32> %x) {
+; CHECK-LABEL: @const_operand(
+; CHECK-NEXT:    ret <4 x i32> <i32 42, i32 45, i32 44, i32 43>
+;
+  %shuf = shufflevector <4 x i32> <i32 42, i32 43, i32 44, i32 45>, <4 x i32> %x, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @merge(<4 x i32> %x) {
+; CHECK-LABEL: @merge(
+; CHECK-NEXT:    ret <4 x i32> [[X:%.*]]
+;
+  %lower = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 1, i32 0>
+  %upper = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %merged = shufflevector <2 x i32> %upper, <2 x i32> %lower, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+  ret <4 x i32> %merged
+}
+
+; This crosses lanes from the source op.
+
+define <4 x i32> @not_merge(<4 x i32> %x) {
+; CHECK-LABEL: @not_merge(
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[U:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[MERGED:%.*]] = shufflevector <2 x i32> [[U]], <2 x i32> [[L]], <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[MERGED]]
+;
+  %l = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %u = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %merged = shufflevector <2 x i32> %u, <2 x i32> %l, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+  ret <4 x i32> %merged
+}
+
+define <8 x double> @extract_and_concat(<8 x double> %x) {
+; CHECK-LABEL: @extract_and_concat(
+; CHECK-NEXT:    ret <8 x double> [[X:%.*]]
+;
+  %s1 = shufflevector <8 x double> %x, <8 x double> poison, <2 x i32> <i32 0, i32 1>
+  %s2 = shufflevector <8 x double> %x, <8 x double> poison, <2 x i32> <i32 2, i32 3>
+  %s3 = shufflevector <8 x double> %x, <8 x double> poison, <2 x i32> <i32 4, i32 5>
+  %s4 = shufflevector <8 x double> %x, <8 x double> poison, <2 x i32> <i32 6, i32 7>
+  %s5 = shufflevector <2 x double> %s1, <2 x double> %s2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s6 = shufflevector <2 x double> %s3, <2 x double> %s4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s7 = shufflevector <4 x double> %s5, <4 x double> %s6, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %s7
+}
+
+; This case has intermediate lane crossings.
+
+define <8 x i64> @PR30630(<8 x i64> %x) {
+; CHECK-LABEL: @PR30630(
+; CHECK-NEXT:    ret <8 x i64> [[X:%.*]]
+;
+  %s1 = shufflevector <8 x i64> %x, <8 x i64> poison, <2 x i32> <i32 0, i32 4>
+  %s2 = shufflevector <8 x i64> %x, <8 x i64> poison, <2 x i32> <i32 1, i32 5>
+  %s3 = shufflevector <8 x i64> %x, <8 x i64> poison, <2 x i32> <i32 2, i32 6>
+  %s4 = shufflevector <8 x i64> %x, <8 x i64> poison, <2 x i32> <i32 3, i32 7>
+  %s5 = shufflevector <2 x i64> %s1, <2 x i64> %s2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s6 = shufflevector <2 x i64> %s3, <2 x i64> %s4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s7 = shufflevector <4 x i64> %s5, <4 x i64> %s6, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  ret <8 x i64> %s7
+}
+
+; This case covers internal canonicalization of shuffles with one constant input vector.
+
+;FIXME: Another issue exposed here, this whole function could be simplified to:
+;         ret <2 x float> zeroinitializer
+define <2 x float> @PR32872(<2 x float> %x) {
+; CHECK-LABEL: @PR32872(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> zeroinitializer, <4 x float> [[TMP1]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    ret <2 x float> [[TMP4]]
+;
+  %tmp1 = shufflevector <2 x float> %x, <2 x float> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
+  %tmp4 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <2 x i32> <i32 4, i32 5>
+  ret <2 x float> %tmp4
+}
+
+define <5 x i8> @splat_inserted_constant(<4 x i8> %x) {
+; CHECK-LABEL: @splat_inserted_constant(
+; CHECK-NEXT:    ret <5 x i8> <i8 42, i8 42, i8 42, i8 42, i8 42>
+;
+  %ins3 = insertelement <4 x i8> %x, i8 42, i64 3
+  %splat5 = shufflevector <4 x i8> %ins3, <4 x i8> poison, <5 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <5 x i8> %splat5
+}
+
+define <4 x float> @splat_inserted_constant_undef_elt(<4 x float> %x) {
+; CHECK-LABEL: @splat_inserted_constant_undef_elt(
+; CHECK-NEXT:    ret <4 x float> <float 1.200000e+01, float 1.200000e+01, float undef, float 1.200000e+01>
+;
+  %ins1 = insertelement <4 x float> %x, float 12.0, i32 1
+  %splat1 = shufflevector <4 x float> %ins1, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 undef, i32 1>
+  ret <4 x float> %splat1
+}
+
+define <2 x i8> @splat_inserted_constant_not_canonical(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @splat_inserted_constant_not_canonical(
+; CHECK-NEXT:    ret <2 x i8> <i8 undef, i8 23>
+;
+  %ins2 = insertelement <3 x i8> %x, i8 23, i7 2
+  %splat2 = shufflevector <3 x i8> %y, <3 x i8> %ins2, <2 x i32> <i32 undef, i32 5>
+  ret <2 x i8> %splat2
+}

diff  --git a/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll
index dac1ae37f7d8..ba448bc81eee 100644
--- a/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll
@@ -61,10 +61,10 @@ define <vscale x 4 x i32> @insertelement_inline_to_ret() {
 
 define <vscale x 4 x i32> @insertelement_shufflevector_inline_to_ret() {
 ; CHECK-LABEL: @insertelement_shufflevector_inline_to_ret(
-; CHECK-NEXT:    ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ;
   %i = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
-  %i2 = shufflevector <vscale x 4 x i32> %i, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %i2 = shufflevector <vscale x 4 x i32> %i, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   ret <vscale x 4 x i32> %i2
 }
 
@@ -140,10 +140,10 @@ define <vscale x 2 x i1> @cmp_le_smax_always_true(<vscale x 2 x i64> %x) {
 
 define <vscale x 4 x float> @bitcast() {
 ; CHECK-LABEL: @bitcast(
-; CHECK-NEXT:    ret <vscale x 4 x float> bitcast (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x float>)
+; CHECK-NEXT:    ret <vscale x 4 x float> bitcast (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x float>)
 ;
   %i1 = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
-  %i2 = shufflevector <vscale x 4 x i32> %i1, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %i2 = shufflevector <vscale x 4 x i32> %i1, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %i3 = bitcast <vscale x 4 x i32> %i2 to <vscale x 4 x float>
   ret <vscale x 4 x float> %i3
 }

diff  --git a/llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles-inseltpoison.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles-inseltpoison.ll
new file mode 100644
index 000000000000..47327b81c2c6
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles-inseltpoison.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -interleaved-access -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define <4 x float> @vld2(<8 x float>* %pSrc) {
+; CHECK-LABEL: @vld2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[L26:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[L43:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[L6:%.*]] = fadd fast <4 x float> [[L43]], [[L26]]
+; CHECK-NEXT:    ret <4 x float> [[L6]]
+;
+entry:
+  %wide.vec = load <8 x float>, <8 x float>* %pSrc, align 4
+  %l2 = fmul fast <8 x float> %wide.vec, %wide.vec
+  %l3 = shufflevector <8 x float> %l2, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %l4 = fmul fast <8 x float> %wide.vec, %wide.vec
+  %l5 = shufflevector <8 x float> %l4, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %l6 = fadd fast <4 x float> %l5, %l3
+  ret <4 x float> %l6
+}
+
+define <4 x float> @vld3(<12 x float>* %pSrc) {
+; CHECK-LABEL: @vld3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <12 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[L29:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[L46:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[L6:%.*]] = fadd fast <4 x float> [[L46]], [[L29]]
+; CHECK-NEXT:    [[L73:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[L9:%.*]] = fadd fast <4 x float> [[L6]], [[L73]]
+; CHECK-NEXT:    ret <4 x float> [[L9]]
+;
+entry:
+  %wide.vec = load <12 x float>, <12 x float>* %pSrc, align 4
+  %l2 = fmul fast <12 x float> %wide.vec, %wide.vec
+  %l3 = shufflevector <12 x float> %l2, <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %l4 = fmul fast <12 x float> %wide.vec, %wide.vec
+  %l5 = shufflevector <12 x float> %l4, <12 x float> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %l6 = fadd fast <4 x float> %l5, %l3
+  %l7 = fmul fast <12 x float> %wide.vec, %wide.vec
+  %l8 = shufflevector <12 x float> %l7, <12 x float> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %l9 = fadd fast <4 x float> %l6, %l8
+  ret <4 x float> %l9
+}
+
+define <4 x float> @vld4(<16 x float>* %pSrc) {
+; CHECK-LABEL: @vld4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[L312:%.*]] = fmul <4 x float> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[L59:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[L7:%.*]] = fadd fast <4 x float> [[L59]], [[L312]]
+; CHECK-NEXT:    [[L86:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[L103:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[L12:%.*]] = fadd fast <4 x float> [[L103]], [[L86]]
+; CHECK-NEXT:    ret <4 x float> [[L12]]
+;
+entry:
+  %wide.vec = load <16 x float>, <16 x float>* %pSrc, align 4
+  %l3 = fmul fast <16 x float> %wide.vec, %wide.vec
+  %l4 = shufflevector <16 x float> %l3, <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %l5 = fmul fast <16 x float> %wide.vec, %wide.vec
+  %l6 = shufflevector <16 x float> %l5, <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %l7 = fadd fast <4 x float> %l6, %l4
+  %l8 = fmul fast <16 x float> %wide.vec, %wide.vec
+  %l9 = shufflevector <16 x float> %l8, <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %l10 = fmul fast <16 x float> %wide.vec, %wide.vec
+  %l11 = shufflevector <16 x float> %l10, <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %l12 = fadd fast <4 x float> %l11, %l9
+  ret <4 x float> %l12
+}
+
+define <4 x float> @twosrc(<8 x float>* %pSrc1, <8 x float>* %pSrc2) {
+; CHECK-LABEL: @twosrc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC1:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x float>* [[PSRC2:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN7:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN7]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN7]], 1
+; CHECK-NEXT:    [[L46:%.*]] = fmul <4 x float> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[L63:%.*]] = fmul <4 x float> [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[L8:%.*]] = fadd fast <4 x float> [[L63]], [[L46]]
+; CHECK-NEXT:    ret <4 x float> [[L8]]
+;
+entry:
+  %wide.vec = load <8 x float>, <8 x float>* %pSrc1, align 4
+  %wide.vec26 = load <8 x float>, <8 x float>* %pSrc2, align 4
+  %l4 = fmul fast <8 x float> %wide.vec26, %wide.vec
+  %l5 = shufflevector <8 x float> %l4, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %l6 = fmul fast <8 x float> %wide.vec26, %wide.vec
+  %l7 = shufflevector <8 x float> %l6, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %l8 = fadd fast <4 x float> %l7, %l5
+  ret <4 x float> %l8
+}
+
+define <4 x float> @twosrc2(<8 x float>* %pSrc1, <8 x float>* %pSrc2) {
+; CHECK-LABEL: @twosrc2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC1:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x float>* [[PSRC2:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN4:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN4]], 1
+; CHECK-NEXT:    [[L43:%.*]] = fmul <4 x float> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[L6:%.*]] = fmul fast <4 x float> [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[L8:%.*]] = fadd fast <4 x float> [[L6]], [[L43]]
+; CHECK-NEXT:    ret <4 x float> [[L8]]
+;
+entry:
+  %wide.vec = load <8 x float>, <8 x float>* %pSrc1, align 4
+  %wide.vec26 = load <8 x float>, <8 x float>* %pSrc2, align 4
+  %l4 = fmul fast <8 x float> %wide.vec26, %wide.vec
+  %l5 = shufflevector <8 x float> %l4, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %s1 = shufflevector <8 x float> %wide.vec26, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %s2 = shufflevector <8 x float> %wide.vec, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %l6 = fmul fast <4 x float> %s1, %s2
+  %l8 = fadd fast <4 x float> %l6, %l5
+  ret <4 x float> %l8
+}

diff  --git a/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-extract-user-inseltpoison.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-extract-user-inseltpoison.ll
new file mode 100644
index 000000000000..a2acc162a646
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-extract-user-inseltpoison.ll
@@ -0,0 +1,113 @@
+; RUN: opt < %s -interleaved-access -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define void @extract_user_basic(<8 x i32>* %ptr, i1 %c) {
+; CHECK-LABEL: @extract_user_basic(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i32>* %ptr to <4 x i32>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    br i1 %c, label %if.then, label %if.merge
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1
+; CHECK-NEXT:    br label %if.merge
+; CHECK:       if.merge:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  br i1 %c, label %if.then, label %if.merge
+
+if.then:
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 2
+  br label %if.merge
+
+if.merge:
+  ret void
+}
+
+define void @extract_user_multi(<8 x i32>* %ptr, i1 %c) {
+; CHECK-LABEL: @extract_user_multi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i32>* %ptr to <4 x i32>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    br i1 %c, label %if.then, label %if.merge
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT:    br label %if.merge
+; CHECK:       if.merge:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  br i1 %c, label %if.then, label %if.merge
+
+if.then:
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 0
+  br label %if.merge
+
+if.merge:
+  %e1 = extractelement <8 x i32> %interleaved.vec, i32 2
+  ret void
+}
+
+define void @extract_user_multi_no_dom(<8 x i32>* %ptr, i1 %c) {
+; CHECK-LABEL: @extract_user_multi_no_dom(
+; CHECK-NOT:     @llvm.aarch64.neon
+; CHECK:         ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 0
+  br i1 %c, label %if.then, label %if.merge
+
+if.then:
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %e1 = extractelement <8 x i32> %interleaved.vec, i32 2
+  br label %if.merge
+
+if.merge:
+  ret void
+}
+
+define void @extract_user_wrong_const_index(<8 x i32>* %ptr) {
+; CHECK-LABEL: @extract_user_wrong_const_index(
+; CHECK-NOT:     @llvm.aarch64.neon
+; CHECK:         ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 1
+  ret void
+}
+
+define void @extract_user_undef_index(<8 x i32>* %ptr) {
+; CHECK-LABEL: @extract_user_undef_index(
+; CHECK-NOT:     @llvm.aarch64.neon
+; CHECK:         ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 undef
+  ret void
+}
+
+define void @extract_user_var_index(<8 x i32>* %ptr, i32 %i) {
+; CHECK-LABEL: @extract_user_var_index(
+; CHECK-NOT:     @llvm.aarch64.neon
+; CHECK:         ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 %i
+  ret void
+}

diff  --git a/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-inseltpoison.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-inseltpoison.ll
new file mode 100644
index 000000000000..ee47da779386
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-inseltpoison.ll
@@ -0,0 +1,801 @@
+; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefix=NEON
+; RUN: opt < %s -mattr=-neon -interleaved-access -S | FileCheck %s -check-prefix=NO_NEON
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define void @load_factor2(<16 x i8>* %ptr) {
+; NEON-LABEL:    @load_factor2(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to <8 x i8>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[LDN]], 1
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[LDN]], 0
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
+  %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor3(<12 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor3(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
+; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor3(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  ret void
+}
+
+define void @load_factor4(<16 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor4(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
+; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor4(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  ret void
+}
+
+define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
+; NEON-LABEL:    @store_factor2(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP3:%.*]] = bitcast <16 x i8>* %ptr to <8 x i8>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v8i8.p0v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8>* [[TMP3]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+; NEON-LABEL:    @store_factor3(
+; NEON:            [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor3(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; NEON-LABEL:    @store_factor4(
+; NEON:            [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor4(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
+; NEON-LABEL:    @load_ptrvec_factor2(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <4 x i32*>* %ptr to <2 x i64>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP1]])
+; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x i32*>
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x i32*>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_ptrvec_factor2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <4 x i32*>, <4 x i32*>* %ptr, align 4
+  %v0 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> poison, <2 x i32> <i32 0, i32 2>
+  %v1 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> poison, <2 x i32> <i32 1, i32 3>
+  ret void
+}
+
+define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
+; NEON-LABEL:    @load_ptrvec_factor3(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <6 x i32*>* %ptr to <2 x i64>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP1]])
+; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 2
+; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x i32*>
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
+; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x i32*>
+; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
+; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x i32*>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_ptrvec_factor3(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <6 x i32*>, <6 x i32*>* %ptr, align 4
+  %v0 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> poison, <2 x i32> <i32 0, i32 3>
+  %v1 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> poison, <2 x i32> <i32 1, i32 4>
+  %v2 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> poison, <2 x i32> <i32 2, i32 5>
+  ret void
+}
+
+define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
+; NEON-LABEL:    @load_ptrvec_factor4(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32*>* %ptr to <2 x i64>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP1]])
+; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 3
+; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x i32*>
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 2
+; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x i32*>
+; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
+; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x i32*>
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
+; NEON-NEXT:       [[TMP9:%.*]] = inttoptr <2 x i64> [[TMP8]] to <2 x i32*>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_ptrvec_factor4(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <8 x i32*>, <8 x i32*>* %ptr, align 4
+  %v0 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <2 x i32> <i32 0, i32 4>
+  %v1 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <2 x i32> <i32 1, i32 5>
+  %v2 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <2 x i32> <i32 2, i32 6>
+  %v3 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <2 x i32> <i32 3, i32 7>
+  ret void
+}
+
+define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
+; NEON-LABEL:    @store_ptrvec_factor2(
+; NEON-NEXT:       [[TMP1:%.*]] = ptrtoint <2 x i32*> %v0 to <2 x i64>
+; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <2 x i32*> %v1 to <2 x i64>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast <4 x i32*>* %ptr to <2 x i64>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v2i64.p0v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_ptrvec_factor2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr, align 4
+  ret void
+}
+
+define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
+; NEON-LABEL:    @store_ptrvec_factor3(
+; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i64>
+; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i64>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast <6 x i32*>* %ptr to <2 x i64>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v2i64.p0v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_ptrvec_factor3(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i32*> %interleaved.vec, <6 x i32*>* %ptr, align 4
+  ret void
+}
+
+define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
+; NEON-LABEL:    @store_ptrvec_factor4(
+; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i64>
+; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i64>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 6, i32 7>
+; NEON-NEXT:       [[TMP7:%.*]] = bitcast <8 x i32*>* %ptr to <2 x i64>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i64.p0v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_ptrvec_factor4(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i32*> %interleaved.vec, <8 x i32*>* %ptr, align 4
+  ret void
+}
+
+define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
+; NEON-LABEL:    @load_undef_mask_factor2(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
+; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_undef_mask_factor2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+  %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+  ret void
+}
+
+define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
+; NEON-LABEL:    @load_undef_mask_factor3(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
+; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_undef_mask_factor3(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+  ret void
+}
+
+define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
+; NEON-LABEL:    @load_undef_mask_factor4(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
+; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_undef_mask_factor4(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+  %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+  %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+  ret void
+}
+
+define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
+; NEON-LABEL:    @store_undef_mask_factor2(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP3:%.*]] = bitcast <8 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_undef_mask_factor2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+; NEON-LABEL:    @store_undef_mask_factor3(
+; NEON:            [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_undef_mask_factor3(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; NEON-LABEL:    @store_undef_mask_factor4(
+; NEON:            [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_undef_mask_factor4(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
+; NEON-LABEL:    @load_illegal_factor2(
+; NEON-NOT:        @llvm.aarch64.neon
+; NEON:            ret void
+; NO_NEON-LABEL: @load_illegal_factor2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16
+  %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+  ret void
+}
+
+define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind {
+; NEON-LABEL:    @store_illegal_factor2(
+; NEON-NOT:        @llvm.aarch64.neon
+; NEON:            ret void
+; NO_NEON-LABEL: @store_illegal_factor2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+  store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16
+  ret void
+}
+
+define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor4(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_general_mask_factor4(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor4_undefbeg(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_general_mask_factor4_undefbeg(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor4_undefend(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_general_mask_factor4_undefend(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor4_undefmid(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_general_mask_factor4_undefmid(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor4_undefmulti(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_general_mask_factor4_undefmulti(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor3(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_general_mask_factor3(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor3_undefmultimid(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_general_mask_factor3_undefmultimid(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor3_undef_fail(
+; NEON-NOT:        @llvm.aarch64.neon
+; NEON:            ret void
+; NO_NEON-LABEL: @store_general_mask_factor3_undef_fail(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor3_undeflane(
+; NEON-NEXT:       [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; NEON-NEXT:       [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_general_mask_factor3_undeflane(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_negativestart(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; NEON-LABEL:    @store_general_mask_factor3_negativestart(
+; NEON-NOT:        @llvm.aarch64.neon
+; NEON:            ret void
+; NO_NEON-LABEL: @store_general_mask_factor3_negativestart(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+@g = external global <4 x float>
+
+; The following does not give a valid interleaved store
+; NEON-LABEL: define void @no_interleave
+; NEON-NOT: call void @llvm.aarch64.neon.st2
+; NEON: shufflevector
+; NEON: store
+; NEON: ret void
+; NO_NEON-LABEL: define void @no_interleave
+; NO_NEON: shufflevector
+; NO_NEON: store
+; NO_NEON: ret void
+define void @no_interleave(<4 x float> %a0) {
+  %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 3, i32 7, i32 undef>
+  store <4 x float> %v0, <4 x float>* @g, align 16
+  ret void
+}
+
+define void @load_factor2_wide2(<16 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor2_wide2(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
+; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor2_wide2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor2_wide3(<24 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor2_wide3(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
+; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
+; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; NEON-NEXT:       [[LDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP10]])
+; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 1
+; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 0
+; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; NEON-NEXT:       [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor2_wide3(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
+  ret void
+}
+
+define void @load_factor3_wide(<24 x i32>* %ptr) {
+; NEON-LABEL: @load_factor3_wide(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP7]])
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
+; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor3_wide(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  ret void
+}
+
+define void @load_factor4_wide(<32 x i32>* %ptr) {
+; NEON-LABEL: @load_factor4_wide(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP8]])
+; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 3
+; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
+; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor4_wide(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
+  %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  ret void
+}
+
+define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
+; NEON-LABEL:    @store_factor2_wide(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
+; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor2_wide(
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
+; NEON-LABEL:    @store_factor3_wide(
+; NEON:            [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
+; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor3_wide(
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
+; NEON-LABEL:    @store_factor4_wide(
+; NEON:            [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]])
+; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; NEON-NEXT:       [[TMP11:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT:       [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor4_wide(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @load_factor2_fp128(<4 x fp128>* %ptr) {
+; NEON-LABEL:    @load_factor2_fp128(
+; NEON-NOT:        @llvm.aarch64.neon
+; NEON:            ret void
+; NO_NEON-LABEL: @load_factor2_fp128(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
+  %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 0, i32 2>
+  %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
+  ret void
+}
+
+define <4 x i1> @load_large_vector(<12 x i64 *>* %p) {
+; NEON-LABEL:    @load_large_vector(
+; NEON:            [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>*
+; NEON-NEXT:       [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
+; NEON-NEXT:       [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP1]] to <2 x i64*>
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
+; NEON-NEXT:       [[TMP4:%.*]] = inttoptr <2 x i64> [[TMP3]] to <2 x i64*>
+; NEON:            [[LDN1:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>*
+; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN1]], 1
+; NEON-NEXT:       [[TMP6:%.*]] = inttoptr <2 x i64> [[TMP5]] to <2 x i64*>
+; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN1]], 0
+; NEON-NEXT:       [[TMP8:%.*]] = inttoptr <2 x i64> [[TMP7]] to <2 x i64*>
+; NEON-NEXT:       shufflevector <2 x i64*> [[TMP2]], <2 x i64*> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       shufflevector <2 x i64*> [[TMP4]], <2 x i64*> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NO_NEON-LABEL: @load_large_vector(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret
+;
+  %l = load <12 x i64 *>, <12 x i64 *>* %p
+  %s1 = shufflevector <12 x i64 *> %l, <12 x i64 *> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %s2 = shufflevector <12 x i64 *> %l, <12 x i64 *> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %ret = icmp ne <4 x i64 *> %s1, %s2
+  ret <4 x i1> %ret
+}

diff  --git a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses-extract-user-inseltpoison.ll b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses-extract-user-inseltpoison.ll
new file mode 100644
index 000000000000..97bbc0ce6f99
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses-extract-user-inseltpoison.ll
@@ -0,0 +1,113 @@
+; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "arm---eabi"
+
+define void @extract_user_basic(<8 x i32>* %ptr, i1 %c) {
+; CHECK-LABEL: @extract_user_basic(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i32>* %ptr to i8*
+; CHECK-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP0]], i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEXT:    br i1 %c, label %if.then, label %if.merge
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1
+; CHECK-NEXT:    br label %if.merge
+; CHECK:       if.merge:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  br i1 %c, label %if.then, label %if.merge
+
+if.then:
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 2
+  br label %if.merge
+
+if.merge:
+  ret void
+}
+
+define void @extract_user_multi(<8 x i32>* %ptr, i1 %c) {
+; CHECK-LABEL: @extract_user_multi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i32>* %ptr to i8*
+; CHECK-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP0]], i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEXT:    br i1 %c, label %if.then, label %if.merge
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT:    br label %if.merge
+; CHECK:       if.merge:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  br i1 %c, label %if.then, label %if.merge
+
+if.then:
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 0
+  br label %if.merge
+
+if.merge:
+  %e1 = extractelement <8 x i32> %interleaved.vec, i32 2
+  ret void
+}
+
+define void @extract_user_multi_no_dom(<8 x i32>* %ptr, i1 %c) {
+; CHECK-LABEL: @extract_user_multi_no_dom(
+; CHECK-NOT:     @llvm.arm.neon
+; CHECK:         ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 0
+  br i1 %c, label %if.then, label %if.merge
+
+if.then:
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %e1 = extractelement <8 x i32> %interleaved.vec, i32 2
+  br label %if.merge
+
+if.merge:
+  ret void
+}
+
+define void @extract_user_wrong_const_index(<8 x i32>* %ptr) {
+; CHECK-LABEL: @extract_user_wrong_const_index(
+; CHECK-NOT:     @llvm.arm.neon
+; CHECK:         ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 1
+  ret void
+}
+
+define void @extract_user_undef_index(<8 x i32>* %ptr) {
+; CHECK-LABEL: @extract_user_undef_index(
+; CHECK-NOT:     @llvm.arm.neon
+; CHECK:         ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 undef
+  ret void
+}
+
+define void @extract_user_var_index(<8 x i32>* %ptr, i32 %i) {
+; CHECK-LABEL: @extract_user_var_index(
+; CHECK-NOT:     @llvm.arm.neon
+; CHECK:         ret void
+;
+entry:
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 8
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %e0 = extractelement <8 x i32> %interleaved.vec, i32 %i
+  ret void
+}

diff  --git a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses-inseltpoison.ll b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses-inseltpoison.ll
new file mode 100644
index 000000000000..05d228d933c1
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses-inseltpoison.ll
@@ -0,0 +1,1432 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s -mattr=+mve.fp -interleaved-access -S | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s -interleaved-access -S | FileCheck %s --check-prefix=CHECK-NONE
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "arm---eabi"
+
+define void @load_factor2(<16 x i8>* %ptr) {
+; CHECK-NEON-LABEL: @load_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i8>, <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i8>, <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
+  %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor3(<6 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <6 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32>, <6 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> poison, <2 x i32> <i32 1, i32 4>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> poison, <2 x i32> <i32 2, i32 5>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32>, <6 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> poison, <2 x i32> <i32 1, i32 4>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> poison, <2 x i32> <i32 2, i32 5>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <6 x i32>, <6 x i32>* %ptr, align 4
+  %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <2 x i32> <i32 0, i32 3>
+  %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <2 x i32> <i32 1, i32 4>
+  %v2 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <2 x i32> <i32 2, i32 5>
+  ret void
+}
+
+define void @load_factor4(<16 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  ret void
+}
+
+define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
+; CHECK-NEON-LABEL: @store_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP3]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-NEON-LABEL: @store_factor3(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor3(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor3(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; CHECK-NEON-LABEL: @store_factor4(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor4(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor4(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
+; CHECK-NEON-LABEL: @load_ptrvec_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_ptrvec_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32*>, <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_ptrvec_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32*>, <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <4 x i32*>, <4 x i32*>* %ptr, align 4
+  %v0 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> poison, <2 x i32> <i32 0, i32 2>
+  ret void
+}
+
+define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
+; CHECK-NEON-LABEL: @load_ptrvec_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_ptrvec_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32*>, <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> poison, <2 x i32> <i32 1, i32 4>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> poison, <2 x i32> <i32 2, i32 5>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_ptrvec_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32*>, <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> poison, <2 x i32> <i32 1, i32 4>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> poison, <2 x i32> <i32 2, i32 5>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <6 x i32*>, <6 x i32*>* %ptr, align 4
+  %v0 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> poison, <2 x i32> <i32 0, i32 3>
+  %v1 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> poison, <2 x i32> <i32 1, i32 4>
+  %v2 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> poison, <2 x i32> <i32 2, i32 5>
+  ret void
+}
+
+define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
+; CHECK-NEON-LABEL: @load_ptrvec_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = inttoptr <2 x i32> [[TMP8]] to <2 x i32*>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_ptrvec_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32*>, <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> poison, <2 x i32> <i32 0, i32 4>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> poison, <2 x i32> <i32 1, i32 5>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> poison, <2 x i32> <i32 2, i32 6>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> poison, <2 x i32> <i32 3, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_ptrvec_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32*>, <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> poison, <2 x i32> <i32 0, i32 4>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> poison, <2 x i32> <i32 1, i32 5>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> poison, <2 x i32> <i32 2, i32 6>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> poison, <2 x i32> <i32 3, i32 7>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <8 x i32*>, <8 x i32*>* %ptr, align 4
+  %v0 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <2 x i32> <i32 0, i32 4>
+  %v1 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <2 x i32> <i32 1, i32 5>
+  %v2 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <2 x i32> <i32 2, i32 6>
+  %v3 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <2 x i32> <i32 3, i32 7>
+  ret void
+}
+
+define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
+; CHECK-NEON-LABEL: @store_ptrvec_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = ptrtoint <2 x i32*> [[V0:%.*]] to <2 x i32>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = ptrtoint <2 x i32*> [[V1:%.*]] to <2 x i32>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_ptrvec_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-MVE-NEXT:    store <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_ptrvec_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NONE-NEXT:    store <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr, align 4
+  ret void
+}
+
+define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
+; CHECK-NEON-LABEL: @store_ptrvec_factor3(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP6]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_ptrvec_factor3(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-MVE-NEXT:    store <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_ptrvec_factor3(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NONE-NEXT:    store <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i32*> %interleaved.vec, <6 x i32*>* %ptr, align 4
+  ret void
+}
+
+define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
+; CHECK-NEON-LABEL: @store_ptrvec_factor4(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP7]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_ptrvec_factor4(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-MVE-NEXT:    store <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_ptrvec_factor4(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NONE-NEXT:    store <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i32*> %interleaved.vec, <8 x i32*>* %ptr, align 4
+  ret void
+}
+
+define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_undef_mask_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_undef_mask_factor2(
+; CHECK-MVE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]])
+; CHECK-MVE-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-MVE-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_undef_mask_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> poison, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> poison, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+  %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+  ret void
+}
+
+define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_undef_mask_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_undef_mask_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_undef_mask_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+  ret void
+}
+
+define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_undef_mask_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_undef_mask_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_undef_mask_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+  %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+  %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+  ret void
+}
+
+define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_undef_mask_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_undef_mask_factor2(
+; CHECK-MVE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT:    call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP3]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 0)
+; CHECK-MVE-NEXT:    call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP3]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 1)
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_undef_mask_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-NEON-LABEL: @store_undef_mask_factor3(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_undef_mask_factor3(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_undef_mask_factor3(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; CHECK-NEON-LABEL: @store_undef_mask_factor4(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_undef_mask_factor4(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_undef_mask_factor4(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @load_address_space(<8 x i32> addrspace(1)* %ptr) {
+; CHECK-NEON-LABEL: @load_address_space(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p1i8(i8 addrspace(1)* [[TMP1]], i32 32)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_address_space(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[PTR:%.*]], align 32
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> poison, <2 x i32> <i32 1, i32 4>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> poison, <2 x i32> <i32 2, i32 5>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_address_space(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[PTR:%.*]], align 32
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> poison, <2 x i32> <i32 1, i32 4>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> poison, <2 x i32> <i32 2, i32 5>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <2 x i32> <i32 0, i32 3>
+  %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <2 x i32> <i32 1, i32 4>
+  %v2 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <2 x i32> <i32 2, i32 5>
+  ret void
+}
+
+define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_address_space(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> [[V1]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP3]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 8)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_address_space(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-MVE-NEXT:    store <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> addrspace(1)* [[PTR:%.*]], align 8
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_address_space(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NONE-NEXT:    store <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> addrspace(1)* [[PTR:%.*]], align 8
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <2 x i32> %v0, <2 x i32> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32> %interleaved.vec, <4 x i32> addrspace(1)* %ptr
+  ret void
+}
+
+define void @load_f16_factor2(<8 x half>* %ptr) {
+; CHECK-NEON-LABEL: @load_f16_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x half>, <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEON-NEXT:    [[V1:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_f16_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x half>, <8 x half>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_f16_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x half>, <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <8 x half>, <8 x half>* %ptr, align 4
+  %v0 = shufflevector <8 x half> %interleaved.vec, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v1 = shufflevector <8 x half> %interleaved.vec, <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret void
+}
+
+define void @store_f16_factor2(<8 x half>* %ptr, <4 x half> %v0, <4 x half> %v1) {
+; CHECK-NEON-LABEL: @store_f16_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x half> [[V0:%.*]], <4 x half> [[V1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEON-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_f16_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x half> [[V0:%.*]], <4 x half> [[V1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-MVE-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], <8 x half>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_f16_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x half> [[V0:%.*]], <4 x half> [[V1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NONE-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x half> %interleaved.vec, <8 x half>* %ptr, align 4
+  ret void
+}
+
+define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
+; CHECK-NEON-LABEL: @load_illegal_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <3 x float>, <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <3 x float> [[INTERLEAVED_VEC]], <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_illegal_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <3 x float>, <3 x float>* [[PTR:%.*]], align 16
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <3 x float> [[INTERLEAVED_VEC]], <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_illegal_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <3 x float>, <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <3 x float> [[INTERLEAVED_VEC]], <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16
+  %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+  ret void
+}
+
+define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind {
+; CHECK-NEON-LABEL: @store_illegal_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <3 x float> [[V0:%.*]], <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NEON-NEXT:    store <3 x float> [[INTERLEAVED_VEC]], <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_illegal_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <3 x float> [[V0:%.*]], <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-MVE-NEXT:    store <3 x float> [[INTERLEAVED_VEC]], <3 x float>* [[PTR:%.*]], align 16
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_illegal_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <3 x float> [[V0:%.*]], <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NONE-NEXT:    store <3 x float> [[INTERLEAVED_VEC]], <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
+  store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16
+  ret void
+}
+
+define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefbeg(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefbeg(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefbeg(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefend(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefend(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefend(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmid(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmid(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefmid(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmulti(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP5]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmulti(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefmulti(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor3_undefmultimid(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_undefmultimid(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_undefmultimid(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor3_undef_fail(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+; CHECK-NEON-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_undef_fail(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_undef_fail(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor3_undeflane(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_undeflane(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_undeflane(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_endstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor3_endstart_fail(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+; CHECK-NEON-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_endstart_fail(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_endstart_fail(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor3_endstart_pass(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_endstart_pass(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_endstart_pass(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_midstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor3_midstart_fail(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NEON-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_midstart_fail(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_midstart_fail(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_general_mask_factor3_midstart_pass(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_midstart_pass(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_midstart_pass(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
+  ret void
+}
+
+ at g = external global <4 x float>
+
+; The following does not give a valid interleaved store
+define void @no_interleave(<4 x float> %a0) {
+; CHECK-NEON-LABEL: @no_interleave(
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A0]], <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+; CHECK-NEON-NEXT:    store <4 x float> [[V0]], <4 x float>* @g, align 16
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @no_interleave(
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A0]], <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+; CHECK-MVE-NEXT:    store <4 x float> [[V0]], <4 x float>* @g, align 16
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @no_interleave(
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A0]], <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+; CHECK-NONE-NEXT:    store <4 x float> [[V0]], <4 x float>* @g, align 16
+; CHECK-NONE-NEXT:    ret void
+;
+  %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+  store <4 x float> %v0, <4 x float>* @g, align 16
+  ret void
+}
+
+define void @load_factor2_wide2(<16 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_factor2_wide2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_wide2(
+; CHECK-MVE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]])
+; CHECK-MVE-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-MVE-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-MVE-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-MVE-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP4]])
+; CHECK-MVE-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-MVE-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-MVE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_wide2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor2_wide3(<24 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_factor2_wide3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_wide3(
+; CHECK-MVE-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]])
+; CHECK-MVE-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-MVE-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-MVE-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-MVE-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP4]])
+; CHECK-MVE-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-MVE-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-MVE-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP4]], i32 8
+; CHECK-MVE-NEXT:    [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP7]])
+; CHECK-MVE-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
+; CHECK-MVE-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
+; CHECK-MVE-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-MVE-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_wide3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> poison, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
+  ret void
+}
+
+define void @load_factor3_wide(<24 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_factor3_wide(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor3_wide(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor3_wide(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  ret void
+}
+
+define void @load_factor4_wide(<32 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_factor4_wide(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor4_wide(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor4_wide(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
+  %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  ret void
+}
+
+define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
+; CHECK-NEON-LABEL: @store_factor2_wide(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP5]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor2_wide(
+; CHECK-MVE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-MVE-NEXT:    call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 0)
+; CHECK-MVE-NEXT:    call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 1)
+; CHECK-MVE-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-MVE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 0)
+; CHECK-MVE-NEXT:    call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 1)
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor2_wide(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
+; CHECK-NEON-LABEL: @store_factor3_wide(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP5]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP6]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP10]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor3_wide(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-MVE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor3_wide(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NONE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
+; CHECK-NEON-LABEL: @store_factor4_wide(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP12]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor4_wide(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-MVE-NEXT:    store <32 x i32> [[INTERLEAVED_VEC]], <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor4_wide(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NONE-NEXT:    store <32 x i32> [[INTERLEAVED_VEC]], <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @load_factor2_fp128(<4 x fp128>* %ptr) {
+; CHECK-NEON-LABEL: @load_factor2_fp128(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, <4 x fp128>* [[PTR:%.*]], align 16
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEON-NEXT:    [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_fp128(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, <4 x fp128>* [[PTR:%.*]], align 16
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_fp128(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, <4 x fp128>* [[PTR:%.*]], align 16
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
+  %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 0, i32 2>
+  %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
+  ret void
+}
+
+define void @load_factor2_wide_pointer(<16 x i32*>* %ptr) {
+; CHECK-NEON-LABEL: @load_factor2_wide_pointer(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32*>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = inttoptr <4 x i32> [[TMP3]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = inttoptr <4 x i32> [[TMP5]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_wide_pointer(
+; CHECK-MVE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32*>* [[PTR:%.*]] to i32*
+; CHECK-MVE-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP1]])
+; CHECK-MVE-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-MVE-NEXT:    [[TMP3:%.*]] = inttoptr <4 x i32> [[TMP2]] to <4 x i32*>
+; CHECK-MVE-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-MVE-NEXT:    [[TMP5:%.*]] = inttoptr <4 x i32> [[TMP4]] to <4 x i32*>
+; CHECK-MVE-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-MVE-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[TMP6]])
+; CHECK-MVE-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-MVE-NEXT:    [[TMP8:%.*]] = inttoptr <4 x i32> [[TMP7]] to <4 x i32*>
+; CHECK-MVE-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-MVE-NEXT:    [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
+; CHECK-MVE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32*> [[TMP5]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_wide_pointer(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32*>, <16 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <16 x i32*>, <16 x i32*>* %ptr, align 4
+  %v0 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+; This would be a candidate for interleaving, except that the load doesn't
+; actually load enough elements to satisfy the shuffle masks. (It would be
+; possible to produce a vld2.v2i32, but that currently isn't implemented.)
+define void @load_out_of_range(<4 x i32>* %ptr) {
+; CHECK-NEON-LABEL: @load_out_of_range(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[V1:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_out_of_range(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_out_of_range(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    ret void
+;
+  %interleaved.vec = load <4 x i32>, <4 x i32>* %ptr, align 4
+  %v0 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  ret void
+}

diff  --git a/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx-inseltpoison.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx-inseltpoison.ll
new file mode 100644
index 000000000000..892a9e81a386
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx-inseltpoison.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-pc-linux  -mattr=+avx -interleaved-access -S | FileCheck %s
+
+; This file tests the function `llvm::lowerInterleavedLoad/Store`.
+
+define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
+; CHECK-LABEL: @load_factorf64_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* [[PTR:%.*]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x double>, <4 x double>* [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x double>, <4 x double>* [[TMP8]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP7]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP9]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP11]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP13]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP11]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP13]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd <4 x double> [[TMP14]], [[TMP16]]
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd <4 x double> [[ADD1]], [[TMP15]]
+; CHECK-NEXT:    [[ADD3:%.*]] = fadd <4 x double> [[ADD2]], [[TMP17]]
+; CHECK-NEXT:    ret <4 x double> [[ADD3]]
+;
+  %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
+  %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.v1 = shufflevector <16 x double> %wide.vec, <16 x double> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %strided.v2 = shufflevector <16 x double> %wide.vec, <16 x double> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %add1 = fadd <4 x double> %strided.v0, %strided.v1
+  %add2 = fadd <4 x double> %add1, %strided.v2
+  %add3 = fadd <4 x double> %add2, %strided.v3
+  ret <4 x double> %add3
+}
+
+define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
+; CHECK-LABEL: @load_factori64_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i64>* [[PTR:%.*]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <4 x i64>, <4 x i64>* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <4 x i64>, <4 x i64>* [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <4 x i64>, <4 x i64>* [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr <4 x i64>, <4 x i64>* [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, <4 x i64>* [[TMP8]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP7]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP9]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> [[TMP11]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> [[TMP11]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[ADD1:%.*]] = add <4 x i64> [[TMP14]], [[TMP16]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add <4 x i64> [[ADD1]], [[TMP15]]
+; CHECK-NEXT:    [[ADD3:%.*]] = add <4 x i64> [[ADD2]], [[TMP17]]
+; CHECK-NEXT:    ret <4 x i64> [[ADD3]]
+;
+  %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
+  %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %strided.v2 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %strided.v3 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %add1 = add <4 x i64> %strided.v0, %strided.v1
+  %add2 = add <4 x i64> %add1, %strided.v2
+  %add3 = add <4 x i64> %add2, %strided.v3
+  ret <4 x i64> %add3
+}
+
+define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
+; CHECK-LABEL: @load_factorf64_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* [[PTR:%.*]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x double>, <4 x double>* [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x double>, <4 x double>* [[TMP8]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP7]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP9]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP11]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP13]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP11]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP13]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x double> [[TMP14]], [[TMP14]]
+; CHECK-NEXT:    ret <4 x double> [[MUL]]
+;
+  %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
+  %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %mul = fmul <4 x double> %strided.v0, %strided.v3
+  ret <4 x double> %mul
+}
+
+define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
+; CHECK-LABEL: @store_factorf64_4(
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    store <16 x double> [[TMP15]], <16 x double>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
+  ret void
+}
+
+define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) {
+; CHECK-LABEL: @store_factori64_4(
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x i64> [[V2:%.*]], <4 x i64> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP4]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP8]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i64> [[TMP13]], <8 x i64> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    store <16 x i64> [[TMP15]], <16 x i64>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i64> %interleaved.vec, <16 x i64>* %ptr, align 16
+  ret void
+}
+
+define void @store_factorf64_4_revMask(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
+; CHECK-LABEL: @store_factorf64_4_revMask(
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    store <16 x double> [[TMP15]], <16 x double>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 12, i32 8, i32 4, i32 0, i32 13, i32 9, i32 5, i32 1, i32 14, i32 10, i32 6, i32 2, i32 15, i32 11, i32 7, i32 3>
+  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
+  ret void
+}
+
+define void @store_factorf64_4_arbitraryMask(<16 x double>* %ptr, <16 x double> %v0, <16 x double> %v1, <16 x double> %v2, <16 x double> %v3) {
+; CHECK-LABEL: @store_factorf64_4_arbitraryMask(
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <16 x double> [[V0:%.*]], <16 x double> [[V1:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x double> [[V2:%.*]], <16 x double> [[V3:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x double> [[S0]], <32 x double> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x double> [[S0]], <32 x double> [[S1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x double> [[S0]], <32 x double> [[S1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <32 x double> [[S0]], <32 x double> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    store <16 x double> [[TMP15]], <16 x double>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s0 = shufflevector <16 x double> %v0, <16 x double> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %s1 = shufflevector <16 x double> %v2, <16 x double> %v3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %interleaved.vec = shufflevector <32 x double> %s0, <32 x double> %s1, <16 x i32> <i32 4, i32 32, i32 16, i32 8, i32 5, i32 33, i32 17, i32 9, i32 6, i32 34, i32 18, i32 10, i32 7, i32 35, i32 19, i32 11>
+  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
+  ret void
+}
+
+; This verifies that the pass runs on this input without hitting any assertions.
+; Today, X86InterleavedAccess could handle this case and generate a transposed
+; sequence by extending the current implementation (creating dummy vectors of
+; undef), but it deliberately does not optimize cases where the load size is
+; less than Factor * NumberOfElements, because a better sequence can easily be
+; generated by CodeGen.
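+; (Illustrative arithmetic, assuming Factor = 4 with 4 elements per member:
+; Factor * NumberOfElements would be 16, while @test_unhandled below loads only
+; a <4 x double>, so the condition above holds and CodeGen handles it instead.)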
+
+ at a = local_unnamed_addr global <4 x double> zeroinitializer, align 32
+; Function Attrs: norecurse nounwind readonly uwtable
+define <4 x double> @test_unhandled(<4 x double> %b) {
+; CHECK-LABEL: @test_unhandled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, <4 x double>* @a, align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 0, i32 0>
+; CHECK-NEXT:    ret <4 x double> [[SHUFFLE]]
+;
+entry:
+  %0 = load <4 x double>, <4 x double>* @a, align 32
+  %1 = shufflevector <4 x double> %0, <4 x double> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x double> %1, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}

diff  --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad-inseltpoison.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad-inseltpoison.ll
new file mode 100644
index 000000000000..f713918292dc
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad-inseltpoison.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx2 -interleaved-access -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw -mattr=+avx512vl -interleaved-access -S | FileCheck %s
+
+define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
+; CHECK-LABEL: @interleaved_load_vf32_i8_stride3(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <96 x i8>* [[PTR:%.*]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 128
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 16
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5
+; CHECK-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> [[TMP17]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> [[TMP18]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <32 x i8> [[TMP18]], <32 x i8> [[TMP19]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <32 x i8> [[TMP21]], <32 x i8> [[TMP20]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP22]], <32 x i8> [[TMP21]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP20]], <32 x i8> [[TMP22]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <32 x i8> [[TMP24]], <32 x i8> undef, <32 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> undef, <32 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25>
+; CHECK-NEXT:    [[ADD1:%.*]] = add <32 x i8> [[TMP27]], [[TMP26]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add <32 x i8> [[TMP25]], [[ADD1]]
+; CHECK-NEXT:    ret <32 x i8> [[ADD2]]
+;
+  %wide.vec = load <96 x i8>, <96 x i8>* %ptr
+  %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
+  %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
+  %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
+  %add1 = add <32 x i8> %v1, %v2
+  %add2 = add <32 x i8> %v3, %add1
+  ret <32 x i8> %add2
+}
+
+define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
+; CHECK-LABEL: @interleaved_load_vf16_i8_stride3(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <48 x i8>* [[PTR:%.*]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 64
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP8]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP11]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP12]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> [[TMP13]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> undef, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> undef, <16 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT:    [[ADD1:%.*]] = add <16 x i8> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add <16 x i8> [[TMP16]], [[ADD1]]
+; CHECK-NEXT:    ret <16 x i8> [[ADD2]]
+;
+  %wide.vec = load <48 x i8>, <48 x i8>* %ptr
+  %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45>
+  %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
+  %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
+  %add1 = add <16 x i8> %v1, %v2
+  %add2 = add <16 x i8> %v3, %add1
+  ret <16 x i8> %add2
+}
+
+define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
+; CHECK-LABEL: @interleaved_load_vf8_i8_stride3(
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i8>, <24 x i8>* [[PTR:%.*]], align 32
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:    [[V2:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:    [[V3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:    [[ADD1:%.*]] = add <8 x i8> [[V1]], [[V2]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add <8 x i8> [[V3]], [[ADD1]]
+; CHECK-NEXT:    ret <8 x i8> [[ADD2]]
+;
+  %wide.vec = load <24 x i8>, <24 x i8>* %ptr
+  %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison,<8 x i32> <i32 0,i32 3,i32 6,i32  9,i32 12,i32 15,i32 18,i32 21>
+  %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
+  %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
+  %add1 = add <8 x i8> %v1, %v2
+  %add2 = add <8 x i8> %v3, %add1
+  ret <8 x i8> %add2
+}
+
+define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
+; CHECK-LABEL: @interleaved_load_vf64_i8_stride3(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <192 x i8>* [[PTR:%.*]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5
+; CHECK-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 6
+; CHECK-NEXT:    [[TMP15:%.*]] = load <16 x i8>, <16 x i8>* [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 7
+; CHECK-NEXT:    [[TMP17:%.*]] = load <16 x i8>, <16 x i8>* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load <16 x i8>, <16 x i8>* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 9
+; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, <16 x i8>* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 10
+; CHECK-NEXT:    [[TMP23:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 11
+; CHECK-NEXT:    [[TMP25:%.*]] = load <16 x i8>, <16 x i8>* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP21]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP23]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP19]], <16 x i8> [[TMP25]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <32 x i8> [[TMP26]], <32 x i8> [[TMP29]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <32 x i8> [[TMP27]], <32 x i8> [[TMP30]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <32 x i8> [[TMP28]], <32 x i8> [[TMP31]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <64 x i8> [[TMP32]], <64 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 34, i32 37, i32 40, i32 43, i32 46, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 50, i32 53, i32 56, i32 59, i32 62, i32 49, i32 52, i32 55, i32 58, i32 61>
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <64 x i8> [[TMP33]], <64 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 34, i32 37, i32 40, i32 43, i32 46, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 50, i32 53, i32 56, i32 59, i32 62, i32 49, i32 52, i32 55, i32 58, i32 61>
+; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <64 x i8> [[TMP34]], <64 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 34, i32 37, i32 40, i32 43, i32 46, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 50, i32 53, i32 56, i32 59, i32 62, i32 49, i32 52, i32 55, i32 58, i32 61>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <64 x i8> [[TMP37]], <64 x i8> [[TMP35]], <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <64 x i8> [[TMP35]], <64 x i8> [[TMP36]], <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <64 x i8> [[TMP36]], <64 x i8> [[TMP37]], <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <64 x i8> [[TMP39]], <64 x i8> [[TMP38]], <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <64 x i8> [[TMP40]], <64 x i8> [[TMP39]], <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <64 x i8> [[TMP38]], <64 x i8> [[TMP40]], <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <64 x i8> [[TMP42]], <64 x i8> undef, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 32, i32 33, i32 34, i32 35, i32 36, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 48, i32 49, i32 50, i32 51, i32 52>
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <64 x i8> [[TMP41]], <64 x i8> undef, <64 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57>
+; CHECK-NEXT:    [[ADD1:%.*]] = add <64 x i8> [[TMP45]], [[TMP44]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add <64 x i8> [[TMP43]], [[ADD1]]
+; CHECK-NEXT:    ret <64 x i8> [[ADD2]]
+;
+  %wide.vec = load <192 x i8>, <192 x i8>* %ptr, align 1
+  %v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
+  %v2 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
+  %v3 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
+  %add1 = add <64 x i8> %v1, %v2
+  %add2 = add <64 x i8> %v3, %add1
+  ret <64 x i8> %add2
+}

diff  --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore-inseltpoison.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore-inseltpoison.ll
new file mode 100644
index 000000000000..a32c125ec7d1
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore-inseltpoison.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+
+; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx2 -interleaved-access -S | FileCheck %s
+
+define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, <128 x i8>* %p) {
+; CHECK-LABEL: @interleaved_store_vf32_i8_stride4(
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[V2:%.*]] = shufflevector <32 x i8> [[X3:%.*]], <32 x i8> [[X4:%.*]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[V1]], <64 x i8> [[V2]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <64 x i8> [[V1]], <64 x i8> [[V2]], <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <64 x i8> [[V1]], <64 x i8> [[V2]], <32 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <64 x i8> [[V1]], <64 x i8> [[V2]], <32 x i32> <i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> [[TMP4]], <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> [[TMP4]], <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> [[TMP7]], <32 x i32> <i32 0, i32 1, i32 32, i32 33, i32 2, i32 3, i32 34, i32 35, i32 4, i32 5, i32 36, i32 37, i32 6, i32 7, i32 38, i32 39, i32 16, i32 17, i32 48, i32 49, i32 18, i32 19, i32 50, i32 51, i32 20, i32 21, i32 52, i32 53, i32 22, i32 23, i32 54, i32 55>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> [[TMP7]], <32 x i32> <i32 8, i32 9, i32 40, i32 41, i32 10, i32 11, i32 42, i32 43, i32 12, i32 13, i32 44, i32 45, i32 14, i32 15, i32 46, i32 47, i32 24, i32 25, i32 56, i32 57, i32 26, i32 27, i32 58, i32 59, i32 28, i32 29, i32 60, i32 61, i32 30, i32 31, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> [[TMP8]], <32 x i32> <i32 0, i32 1, i32 32, i32 33, i32 2, i32 3, i32 34, i32 35, i32 4, i32 5, i32 36, i32 37, i32 6, i32 7, i32 38, i32 39, i32 16, i32 17, i32 48, i32 49, i32 18, i32 19, i32 50, i32 51, i32 20, i32 21, i32 52, i32 53, i32 22, i32 23, i32 54, i32 55>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> [[TMP8]], <32 x i32> <i32 8, i32 9, i32 40, i32 41, i32 10, i32 11, i32 42, i32 43, i32 12, i32 13, i32 44, i32 45, i32 14, i32 15, i32 46, i32 47, i32 24, i32 25, i32 56, i32 57, i32 26, i32 27, i32 58, i32 59, i32 28, i32 29, i32 60, i32 61, i32 30, i32 31, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP9]], <32 x i8> [[TMP10]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP11]], <32 x i8> [[TMP12]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP9]], <32 x i8> [[TMP10]], <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <32 x i8> [[TMP11]], <32 x i8> [[TMP12]], <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> [[TMP14]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> [[TMP16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> [[TMP18]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    store <128 x i8> [[TMP19]], <128 x i8>* [[P:%.*]], align 128
+; CHECK-NEXT:    ret void
+;
+  %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
+  store <128 x i8> %interleaved.vec, <128 x i8>* %p
+  ret void
+}
+
+define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, <64 x i8>* %p) {
+; CHECK-LABEL: @interleaved_store_vf16_i8_stride4(
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[V2:%.*]] = shufflevector <16 x i8> [[X3:%.*]], <16 x i8> [[X4:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[V1]], <32 x i8> [[V2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i8> [[V1]], <32 x i8> [[V2]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[V1]], <32 x i8> [[V2]], <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i8> [[V1]], <32 x i8> [[V2]], <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i32> <i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP8]], <16 x i32> <i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> [[TMP12]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> [[TMP14]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    store <64 x i8> [[TMP15]], <64 x i8>* [[P:%.*]], align 64
+; CHECK-NEXT:    ret void
+;
+  %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>
+  store <64 x i8> %interleaved.vec, <64 x i8>* %p
+  ret void
+}
+
+define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
+; CHECK-LABEL: @interleaved_store_vf8_i8_stride4(
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <8 x i8> [[X1:%.*]], <8 x i8> [[X2:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[V2:%.*]] = shufflevector <8 x i8> [[X3:%.*]], <8 x i8> [[X4:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> <i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP8]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    store <32 x i8> [[TMP9]], <32 x i8>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret void
+;
+  %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
+  store <32 x i8> %interleaved.vec, <32 x i8>* %p
+ret void
+}
+
+define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
+; CHECK-LABEL: @interleaved_store_vf8_i8_stride3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[C:%.*]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:    store <24 x i8> [[INTERLEAVED_VEC]], <24 x i8>* [[P:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+%1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+%2 = shufflevector <8 x i8> %c, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+%interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+store <24 x i8> %interleaved.vec, <24 x i8>* %p, align 1
+ret void
+}
+
+define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
+; CHECK-LABEL: @interleaved_store_vf16_i8_stride3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> undef, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP5]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP6]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP8]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> undef, <16 x i32> <i32 0, i32 11, i32 6, i32 1, i32 12, i32 7, i32 2, i32 13, i32 8, i32 3, i32 14, i32 9, i32 4, i32 15, i32 10, i32 5>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> undef, <16 x i32> <i32 0, i32 11, i32 6, i32 1, i32 12, i32 7, i32 2, i32 13, i32 8, i32 3, i32 14, i32 9, i32 4, i32 15, i32 10, i32 5>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> undef, <16 x i32> <i32 0, i32 11, i32 6, i32 1, i32 12, i32 7, i32 2, i32 13, i32 8, i32 3, i32 14, i32 9, i32 4, i32 15, i32 10, i32 5>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> [[TMP18]], <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; CHECK-NEXT:    store <48 x i8> [[TMP19]], <48 x i8>* [[P:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+%1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+%2 = shufflevector <16 x i8> %c, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+%interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+store <48 x i8> %interleaved.vec, <48 x i8>* %p, align 1
+ret void
+}
+
+define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <96 x i8>* %p) {
+; CHECK-LABEL: @interleaved_store_vf32_i8_stride3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i8> [[C:%.*]], <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <32 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> undef, <32 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> undef, <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> [[TMP5]], <32 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> [[TMP6]], <32 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> [[TMP7]], <32 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> [[TMP9]], <32 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i8> [[TMP9]], <32 x i8> [[TMP10]], <32 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP10]], <32 x i8> [[TMP8]], <32 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP11]], <32 x i8> [[TMP12]], <32 x i32> <i32 0, i32 11, i32 6, i32 1, i32 12, i32 7, i32 2, i32 13, i32 8, i32 3, i32 14, i32 9, i32 4, i32 15, i32 10, i32 5, i32 32, i32 43, i32 38, i32 33, i32 44, i32 39, i32 34, i32 45, i32 40, i32 35, i32 46, i32 41, i32 36, i32 47, i32 42, i32 37>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> [[TMP11]], <32 x i32> <i32 0, i32 11, i32 6, i32 1, i32 12, i32 7, i32 2, i32 13, i32 8, i32 3, i32 14, i32 9, i32 4, i32 15, i32 10, i32 5, i32 48, i32 59, i32 54, i32 49, i32 60, i32 55, i32 50, i32 61, i32 56, i32 51, i32 62, i32 57, i32 52, i32 63, i32 58, i32 53>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> [[TMP13]], <32 x i32> <i32 16, i32 27, i32 22, i32 17, i32 28, i32 23, i32 18, i32 29, i32 24, i32 19, i32 30, i32 25, i32 20, i32 31, i32 26, i32 21, i32 48, i32 59, i32 54, i32 49, i32 60, i32 55, i32 50, i32 61, i32 56, i32 51, i32 62, i32 57, i32 52, i32 63, i32 58, i32 53>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> [[TMP18]], <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+; CHECK-NEXT:    store <96 x i8> [[TMP19]], <96 x i8>* [[P:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+%1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+%2 = shufflevector <32 x i8> %c, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+%interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
+store <96 x i8> %interleaved.vec, <96 x i8>* %p, align 1
+ret void
+}
+
+define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <192 x i8>* %p) {
+; CHECK-LABEL: @interleaved_store_vf64_i8_stride3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <64 x i8> [[C:%.*]], <64 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <128 x i8> [[TMP1]], <128 x i8> [[TMP2]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <128 x i8> [[TMP1]], <128 x i8> [[TMP2]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <128 x i8> [[TMP1]], <128 x i8> [[TMP2]], <64 x i32> <i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <64 x i8> [[TMP3]], <64 x i8> undef, <64 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP4]], <64 x i8> undef, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 43, i32 44, i32 45, i32 46, i32 47, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 59, i32 60, i32 61, i32 62, i32 63, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <64 x i8> [[TMP6]], <64 x i8> [[TMP5]], <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <64 x i8> [[TMP7]], <64 x i8> [[TMP6]], <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <64 x i8> [[TMP5]], <64 x i8> [[TMP7]], <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <64 x i8> [[TMP8]], <64 x i8> [[TMP9]], <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <64 x i8> [[TMP9]], <64 x i8> [[TMP10]], <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <64 x i8> [[TMP10]], <64 x i8> [[TMP8]], <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i8> [[TMP11]], <64 x i8> [[TMP12]], <32 x i32> <i32 0, i32 11, i32 6, i32 1, i32 12, i32 7, i32 2, i32 13, i32 8, i32 3, i32 14, i32 9, i32 4, i32 15, i32 10, i32 5, i32 64, i32 75, i32 70, i32 65, i32 76, i32 71, i32 66, i32 77, i32 72, i32 67, i32 78, i32 73, i32 68, i32 79, i32 74, i32 69>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <64 x i8> [[TMP13]], <64 x i8> [[TMP11]], <32 x i32> <i32 0, i32 11, i32 6, i32 1, i32 12, i32 7, i32 2, i32 13, i32 8, i32 3, i32 14, i32 9, i32 4, i32 15, i32 10, i32 5, i32 80, i32 91, i32 86, i32 81, i32 92, i32 87, i32 82, i32 93, i32 88, i32 83, i32 94, i32 89, i32 84, i32 95, i32 90, i32 85>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> [[TMP13]], <32 x i32> <i32 16, i32 27, i32 22, i32 17, i32 28, i32 23, i32 18, i32 29, i32 24, i32 19, i32 30, i32 25, i32 20, i32 31, i32 26, i32 21, i32 80, i32 91, i32 86, i32 81, i32 92, i32 87, i32 82, i32 93, i32 88, i32 83, i32 94, i32 89, i32 84, i32 95, i32 90, i32 85>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <64 x i8> [[TMP11]], <64 x i8> [[TMP12]], <32 x i32> <i32 32, i32 43, i32 38, i32 33, i32 44, i32 39, i32 34, i32 45, i32 40, i32 35, i32 46, i32 41, i32 36, i32 47, i32 42, i32 37, i32 96, i32 107, i32 102, i32 97, i32 108, i32 103, i32 98, i32 109, i32 104, i32 99, i32 110, i32 105, i32 100, i32 111, i32 106, i32 101>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <64 x i8> [[TMP13]], <64 x i8> [[TMP11]], <32 x i32> <i32 32, i32 43, i32 38, i32 33, i32 44, i32 39, i32 34, i32 45, i32 40, i32 35, i32 46, i32 41, i32 36, i32 47, i32 42, i32 37, i32 112, i32 123, i32 118, i32 113, i32 124, i32 119, i32 114, i32 125, i32 120, i32 115, i32 126, i32 121, i32 116, i32 127, i32 122, i32 117>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP12]], <64 x i8> [[TMP13]], <32 x i32> <i32 48, i32 59, i32 54, i32 49, i32 60, i32 55, i32 50, i32 61, i32 56, i32 51, i32 62, i32 57, i32 52, i32 63, i32 58, i32 53, i32 112, i32 123, i32 118, i32 113, i32 124, i32 119, i32 114, i32 125, i32 120, i32 115, i32 126, i32 121, i32 116, i32 127, i32 122, i32 117>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> [[TMP17]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <32 x i8> [[TMP18]], <32 x i8> [[TMP19]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <64 x i8> [[TMP20]], <64 x i8> [[TMP21]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <64 x i8> [[TMP22]], <64 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <128 x i8> [[TMP23]], <128 x i8> [[TMP24]], <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
+; CHECK-NEXT:    store <192 x i8> [[TMP25]], <192 x i8>* [[P:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+%1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+%2 = shufflevector <64 x i8> %c, <64 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+%3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
+store <192 x i8> %3, <192 x i8>* %p, align 1
+ret void
+}
+
+define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c,<64 x i8> %d, <256 x i8>* %p) {
+; CHECK-LABEL: @interleaved_store_vf64_i8_stride4(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <64 x i8> [[C:%.*]], <64 x i8> [[D:%.*]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <128 x i8> [[TMP1]], <128 x i8> [[TMP2]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <128 x i8> [[TMP1]], <128 x i8> [[TMP2]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <128 x i8> [[TMP1]], <128 x i8> [[TMP2]], <64 x i32> <i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <128 x i8> [[TMP1]], <128 x i8> [[TMP2]], <64 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <64 x i8> [[TMP3]], <64 x i8> [[TMP4]], <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <64 x i8> [[TMP3]], <64 x i8> [[TMP4]], <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <64 x i8> [[TMP5]], <64 x i8> [[TMP6]], <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <64 x i8> [[TMP5]], <64 x i8> [[TMP6]], <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <64 x i8> [[TMP7]], <64 x i8> [[TMP9]], <64 x i32> <i32 0, i32 1, i32 64, i32 65, i32 2, i32 3, i32 66, i32 67, i32 4, i32 5, i32 68, i32 69, i32 6, i32 7, i32 70, i32 71, i32 16, i32 17, i32 80, i32 81, i32 18, i32 19, i32 82, i32 83, i32 20, i32 21, i32 84, i32 85, i32 22, i32 23, i32 86, i32 87, i32 32, i32 33, i32 96, i32 97, i32 34, i32 35, i32 98, i32 99, i32 36, i32 37, i32 100, i32 101, i32 38, i32 39, i32 102, i32 103, i32 48, i32 49, i32 112, i32 113, i32 50, i32 51, i32 114, i32 115, i32 52, i32 53, i32 116, i32 117, i32 54, i32 55, i32 118, i32 119>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <64 x i8> [[TMP7]], <64 x i8> [[TMP9]], <64 x i32> <i32 8, i32 9, i32 72, i32 73, i32 10, i32 11, i32 74, i32 75, i32 12, i32 13, i32 76, i32 77, i32 14, i32 15, i32 78, i32 79, i32 24, i32 25, i32 88, i32 89, i32 26, i32 27, i32 90, i32 91, i32 28, i32 29, i32 92, i32 93, i32 30, i32 31, i32 94, i32 95, i32 40, i32 41, i32 104, i32 105, i32 42, i32 43, i32 106, i32 107, i32 44, i32 45, i32 108, i32 109, i32 46, i32 47, i32 110, i32 111, i32 56, i32 57, i32 120, i32 121, i32 58, i32 59, i32 122, i32 123, i32 60, i32 61, i32 124, i32 125, i32 62, i32 63, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <64 x i8> [[TMP8]], <64 x i8> [[TMP10]], <64 x i32> <i32 0, i32 1, i32 64, i32 65, i32 2, i32 3, i32 66, i32 67, i32 4, i32 5, i32 68, i32 69, i32 6, i32 7, i32 70, i32 71, i32 16, i32 17, i32 80, i32 81, i32 18, i32 19, i32 82, i32 83, i32 20, i32 21, i32 84, i32 85, i32 22, i32 23, i32 86, i32 87, i32 32, i32 33, i32 96, i32 97, i32 34, i32 35, i32 98, i32 99, i32 36, i32 37, i32 100, i32 101, i32 38, i32 39, i32 102, i32 103, i32 48, i32 49, i32 112, i32 113, i32 50, i32 51, i32 114, i32 115, i32 52, i32 53, i32 116, i32 117, i32 54, i32 55, i32 118, i32 119>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i8> [[TMP8]], <64 x i8> [[TMP10]], <64 x i32> <i32 8, i32 9, i32 72, i32 73, i32 10, i32 11, i32 74, i32 75, i32 12, i32 13, i32 76, i32 77, i32 14, i32 15, i32 78, i32 79, i32 24, i32 25, i32 88, i32 89, i32 26, i32 27, i32 90, i32 91, i32 28, i32 29, i32 92, i32 93, i32 30, i32 31, i32 94, i32 95, i32 40, i32 41, i32 104, i32 105, i32 42, i32 43, i32 106, i32 107, i32 44, i32 45, i32 108, i32 109, i32 46, i32 47, i32 110, i32 111, i32 56, i32 57, i32 120, i32 121, i32 58, i32 59, i32 122, i32 123, i32 60, i32 61, i32 124, i32 125, i32 62, i32 63, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <64 x i8> [[TMP11]], <64 x i8> [[TMP12]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <64 x i8> [[TMP13]], <64 x i8> [[TMP14]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <64 x i8> [[TMP11]], <64 x i8> [[TMP12]], <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <64 x i8> [[TMP13]], <64 x i8> [[TMP14]], <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP11]], <64 x i8> [[TMP12]], <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <64 x i8> [[TMP13]], <64 x i8> [[TMP14]], <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <64 x i8> [[TMP11]], <64 x i8> [[TMP12]], <32 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <64 x i8> [[TMP13]], <64 x i8> [[TMP14]], <32 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> [[TMP16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> [[TMP18]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> [[TMP20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <32 x i8> [[TMP21]], <32 x i8> [[TMP22]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <64 x i8> [[TMP23]], <64 x i8> [[TMP24]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> [[TMP26]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <128 x i8> [[TMP27]], <128 x i8> [[TMP28]], <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
+; CHECK-NEXT:    store <256 x i8> [[TMP29]], <256 x i8>* [[P:%.*]], align 256
+; CHECK-NEXT:    ret void
+;
+%1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+%2 = shufflevector <64 x i8> %c, <64 x i8> %d, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+%interleaved = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
+store <256 x i8> %interleaved, <256 x i8>* %p
+ret void
+}

diff  --git a/llvm/test/Transforms/LoopSimplify/do-preheader-dbg-inseltpoison.ll b/llvm/test/Transforms/LoopSimplify/do-preheader-dbg-inseltpoison.ll
new file mode 100755
index 000000000000..6856da1651a6
--- /dev/null
+++ b/llvm/test/Transforms/LoopSimplify/do-preheader-dbg-inseltpoison.ll
@@ -0,0 +1,122 @@
+; Confirm that the line number for the do.body.preheader block
+; branch is the start of the loop.
+
+; RUN: opt -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -loop-simplify -keep-loops="false" -S <%s | FileCheck %s
+
+; CHECK: do.body.preheader:
+; CHECK-NEXT: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: br label %do.body, !dbg ![[DL:[0-9]+]]
+; CHECK: ![[DL]] = !DILocation(line: 4,
+
+; This IR can be generated by running:
+; clang src.cpp -O2 -g -S -emit-llvm -mllvm -opt-bisect-limit=62 -o -
+;
+; Where src.cpp contains:
+; int foo(char *Bytes, int Count)
+; {
+;     int Total = 0;
+;     do
+;         Total += Bytes[--Count];
+;     while (Count);
+;     return Total;
+; }
+
+define dso_local i32 @"foo"(i8* nocapture readonly %Bytes, i32 %Count) local_unnamed_addr !dbg !8 {
+entry:
+  %0 = sext i32 %Count to i64, !dbg !10
+  %min.iters.check = icmp ult i32 %Count, 8, !dbg !10
+  br i1 %min.iters.check, label %do.body.preheader, label %vector.ph, !dbg !10
+
+vector.ph:                                        ; preds = %entry
+  %n.vec = and i64 %0, -8, !dbg !10
+  %ind.end = sub nsw i64 %0, %n.vec, !dbg !10
+  br label %vector.body, !dbg !10
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
+  %vec.phi5 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ]
+  %1 = xor i64 %index, -1, !dbg !11
+  %2 = add i64 %1, %0, !dbg !11
+  %3 = getelementptr inbounds i8, i8* %Bytes, i64 %2, !dbg !11
+  %4 = getelementptr inbounds i8, i8* %3, i64 -3, !dbg !11
+  %5 = bitcast i8* %4 to <4 x i8>*, !dbg !11
+  %wide.load = load <4 x i8>, <4 x i8>* %5, align 1, !dbg !11, !tbaa !12
+  %reverse = shufflevector <4 x i8> %wide.load, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, !dbg !11
+  %6 = getelementptr inbounds i8, i8* %3, i64 -4, !dbg !11
+  %7 = getelementptr inbounds i8, i8* %6, i64 -3, !dbg !11
+  %8 = bitcast i8* %7 to <4 x i8>*, !dbg !11
+  %wide.load6 = load <4 x i8>, <4 x i8>* %8, align 1, !dbg !11, !tbaa !12
+  %reverse7 = shufflevector <4 x i8> %wide.load6, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, !dbg !11
+  %9 = sext <4 x i8> %reverse to <4 x i32>, !dbg !11
+  %10 = sext <4 x i8> %reverse7 to <4 x i32>, !dbg !11
+  %11 = add nsw <4 x i32> %vec.phi, %9, !dbg !11
+  %12 = add nsw <4 x i32> %vec.phi5, %10, !dbg !11
+  %index.next = add i64 %index, 8
+  %13 = icmp eq i64 %index.next, %n.vec
+  br i1 %13, label %middle.block, label %vector.body, !llvm.loop !15
+
+middle.block:                                     ; preds = %vector.body
+  %.lcssa12 = phi <4 x i32> [ %11, %vector.body ], !dbg !11
+  %.lcssa = phi <4 x i32> [ %12, %vector.body ], !dbg !11
+  %bin.rdx = add <4 x i32> %.lcssa, %.lcssa12, !dbg !11
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>, !dbg !11
+  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf, !dbg !11
+  %rdx.shuf9 = shufflevector <4 x i32> %bin.rdx8, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>, !dbg !11
+  %bin.rdx10 = add <4 x i32> %bin.rdx8, %rdx.shuf9, !dbg !11
+  %14 = extractelement <4 x i32> %bin.rdx10, i32 0, !dbg !11
+  %cmp.n = icmp eq i64 %n.vec, %0
+  br i1 %cmp.n, label %do.end, label %do.body.preheader, !dbg !10
+
+do.body.preheader:                                ; preds = %middle.block, %entry
+  %indvars.iv.ph = phi i64 [ %0, %entry ], [ %ind.end, %middle.block ]
+  %Total.0.ph = phi i32 [ 0, %entry ], [ %14, %middle.block ]
+  br label %do.body, !dbg !11
+
+do.body:                                          ; preds = %do.body.preheader, %do.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body ], [ %indvars.iv.ph, %do.body.preheader ]
+  %Total.0 = phi i32 [ %add, %do.body ], [ %Total.0.ph, %do.body.preheader ], !dbg !18
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1, !dbg !11
+  %arrayidx = getelementptr inbounds i8, i8* %Bytes, i64 %indvars.iv.next, !dbg !11
+  %15 = load i8, i8* %arrayidx, align 1, !dbg !11, !tbaa !12
+  %conv = sext i8 %15 to i32, !dbg !11
+  %add = add nsw i32 %Total.0, %conv, !dbg !11
+  %16 = icmp eq i64 %indvars.iv.next, 0
+  br i1 %16, label %do.end.loopexit, label %do.body, !dbg !11, !llvm.loop !19
+
+do.end.loopexit:                                  ; preds = %do.body
+  %add.lcssa11 = phi i32 [ %add, %do.body ], !dbg !11
+  br label %do.end, !dbg !21
+
+do.end:                                           ; preds = %do.end.loopexit, %middle.block
+  %add.lcssa = phi i32 [ %14, %middle.block ], [ %add.lcssa11, %do.end.loopexit ], !dbg !11
+  ret i32 %add.lcssa, !dbg !21
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "src2.cpp", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!""}
+!8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !9, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!9 = !DISubroutineType(types: !2)
+!10 = !DILocation(line: 4, scope: !8)
+!11 = !DILocation(line: 5, scope: !8)
+!12 = !{!13, !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C++ TBAA"}
+!15 = distinct !{!15, !10, !16, !17}
+!16 = !DILocation(line: 6, scope: !8)
+!17 = !{!"llvm.loop.isvectorized", i32 1}
+!18 = !DILocation(line: 0, scope: !8)
+!19 = distinct !{!19, !10, !16, !20, !17}
+!20 = !{!"llvm.loop.unroll.runtime.disable"}
+!21 = !DILocation(line: 7, scope: !8)

diff  --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void-inseltpoison.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void-inseltpoison.ll
new file mode 100644
index 000000000000..89cdbb88dc59
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void-inseltpoison.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+
+ at array = external addrspace(4) constant [32 x [800 x i32]], align 4
+
+; GCN-LABEL: {{^}}test_lsr_voidty:
+define amdgpu_kernel void @test_lsr_voidty() {
+entry:
+  br label %for.body
+
+for.body:                                 ; preds = %for.body.i, %entry
+  br label %for.body.i
+
+for.body.i:                               ; preds = %for.body.i, %for.body
+  %ij = phi i32 [ 0, %for.body ], [ %inc14, %for.body.i ]
+  %tmp = load i32, i32 addrspace(5)* undef, align 4
+  %inc13 = or i32 %ij, 2
+  %shl = shl i32 1, 0
+  %and = and i32 %shl, %tmp
+  %tobool = icmp eq i32 %and, 0
+  %add = mul nuw nsw i32 %inc13, 5
+  %tmp1 = zext i32 %add to i64
+  %arrayidx8 = getelementptr inbounds [32 x [800 x i32]], [32 x [800 x i32]] addrspace(4)* @array, i64 0, i64 undef, i64 %tmp1
+  %tmp2 = load i32, i32 addrspace(4)* %arrayidx8, align 4
+  %and9 = select i1 %tobool, i32 0, i32 %tmp2
+  %xor = xor i32 undef, %and9
+  %inc1 = or i32 %ij, 3
+  %add2 = mul nuw nsw i32 %inc1, 5
+  %add6 = add nuw nsw i32 %add2, 1
+  %tmp3 = zext i32 %add6 to i64
+  %arrayidx9 = getelementptr inbounds [32 x [800 x i32]], [32 x [800 x i32]] addrspace(4)* @array, i64 0, i64 undef, i64 %tmp3
+  %tmp4 = bitcast i32 addrspace(4)* %arrayidx9 to <4 x i32> addrspace(4)*
+  %tmp5 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp4, align 4
+  %reorder_shuffle2 = shufflevector <4 x i32> %tmp5, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %tmp6 = select <4 x i1> undef, <4 x i32> zeroinitializer, <4 x i32> %reorder_shuffle2
+  %inc14 = add nuw nsw i32 %ij, 4
+  br i1 undef, label %for.body, label %for.body.i
+}

diff  --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll
index 6c21bcae05c6..28d512cc2e1e 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll
@@ -11,7 +11,7 @@ define float @vctp8(float* %0, i32 %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
 ; CHECK-NEXT:    br label [[TMP11:%.*]]
 ; CHECK:       11:
@@ -39,7 +39,7 @@ define float @vctp8(float* %0, i32 %1) {
   %6 = ptrtoint float* %0 to i32
   %7 = insertelement <4 x i32> poison, i32 %6, i32 0
   %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
-  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %9 = shufflevector <4 x i32> %8, <4 x i32> poison, <4 x i32> zeroinitializer
   %10 = add <4 x i32> %4, %9
   br label %11
 
@@ -72,7 +72,7 @@ define float @vctp16(float* %0, i32 %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
 ; CHECK-NEXT:    br label [[TMP11:%.*]]
 ; CHECK:       11:
@@ -100,7 +100,7 @@ define float @vctp16(float* %0, i32 %1) {
   %6 = ptrtoint float* %0 to i32
   %7 = insertelement <4 x i32> poison, i32 %6, i32 0
   %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
-  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %9 = shufflevector <4 x i32> %8, <4 x i32> poison, <4 x i32> zeroinitializer
   %10 = add <4 x i32> %4, %9
   br label %11
 
@@ -133,7 +133,7 @@ define float @vctpi32(float* %0, i32 %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
 ; CHECK-NEXT:    br label [[TMP11:%.*]]
 ; CHECK:       11:
@@ -160,7 +160,7 @@ define float @vctpi32(float* %0, i32 %1) {
   %6 = ptrtoint float* %0 to i32
   %7 = insertelement <4 x i32> poison, i32 %6, i32 0
   %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
-  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %9 = shufflevector <4 x i32> %8, <4 x i32> poison, <4 x i32> zeroinitializer
   %10 = add <4 x i32> %4, %9
   br label %11
 
@@ -193,7 +193,7 @@ define float @vctpi64(float* %0, i32 %1) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
 ; CHECK-NEXT:    br label [[TMP11:%.*]]
 ; CHECK:       11:
@@ -220,7 +220,7 @@ define float @vctpi64(float* %0, i32 %1) {
   %6 = ptrtoint float* %0 to i32
   %7 = insertelement <4 x i32> poison, i32 %6, i32 0
   %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
-  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %9 = shufflevector <4 x i32> %8, <4 x i32> poison, <4 x i32> zeroinitializer
   %10 = add <4 x i32> %4, %9
   br label %11
 

diff  --git a/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors-inseltpoison.ll b/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors-inseltpoison.ll
index 9571f12098b6..e1ff19767087 100644
--- a/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors-inseltpoison.ll
+++ b/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors-inseltpoison.ll
@@ -18,7 +18,7 @@ define i8* @f(i8* returned %s, i32 zeroext %x, i32 signext %k) local_unnamed_add
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N_VEC]], -16
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
@@ -208,7 +208,7 @@ for.body.lr.ph:                                   ; preds = %entry
 vector.ph:                                        ; preds = %for.body.lr.ph
   %n.vec = and i64 %wide.trip.count, 4294967280
   %broadcast.splatinsert = insertelement <16 x i32> poison, i32 %x, i32 0
-  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
+  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph

diff  --git a/llvm/test/Transforms/LoopUnroll/X86/pr46430-inseltpoison.ll b/llvm/test/Transforms/LoopUnroll/X86/pr46430-inseltpoison.ll
new file mode 100644
index 000000000000..4683f9bd69ea
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/X86/pr46430-inseltpoison.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -loop-unroll -costmodel-reduxcost -S -o - %s | FileCheck %s
+
+define void @g() {
+; CHECK-LABEL: @g(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[F_EXIT:%.*]]
+; CHECK:       f.exit:
+; CHECK-NEXT:    [[RDX_SHUF9_I:%.*]] = shufflevector <4 x i16> <i16 1, i16 0, i16 undef, i16 undef>, <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX10_I:%.*]] = xor <4 x i16> <i16 1, i16 0, i16 undef, i16 undef>, [[RDX_SHUF9_I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[BIN_RDX10_I]], i32 0
+; CHECK-NEXT:    br label [[F_EXIT]]
+;
+entry:
+  br label %f.exit
+
+f.exit:                                           ; preds = %f.exit, %entry
+  %rdx.shuf9.i = shufflevector <4 x i16> <i16 1, i16 0, i16 undef, i16 undef>, <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx10.i = xor <4 x i16> <i16 1, i16 0, i16 undef, i16 undef>, %rdx.shuf9.i
+  %0 = extractelement <4 x i16> %bin.rdx10.i, i32 0
+  br label %f.exit
+}
+

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
index c01c9562274a..789e48575ded 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
@@ -35,34 +35,34 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w,
   %add = add <4 x i32> %num, <i32 1, i32 1, i32 1, i32 1>
   %div = sdiv i32 %k, 2
   %splatinsert = insertelement <4 x i32> poison, i32 %div, i32 0
-  %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
   %add1 = add <4 x i32> %add, %splat
   %mul = mul nsw i32 %p, 6234
   %splatinsert2 = insertelement <4 x i32> poison, i32 %mul, i32 0
-  %splat3 = shufflevector <4 x i32> %splatinsert2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat3 = shufflevector <4 x i32> %splatinsert2, <4 x i32> poison, <4 x i32> zeroinitializer
   %add4 = add <4 x i32> %add1, %splat3
   %mul5 = mul nsw i32 75, %h
   %splatinsert6 = insertelement <4 x i32> poison, i32 %mul5, i32 0
-  %splat7 = shufflevector <4 x i32> %splatinsert6, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat7 = shufflevector <4 x i32> %splatinsert6, <4 x i32> poison, <4 x i32> zeroinitializer
   %add8 = add <4 x i32> %add4, %splat7
   %div9 = sdiv i32 %j, 3452
   %splatinsert10 = insertelement <4 x i32> poison, i32 %div9, i32 0
-  %splat11 = shufflevector <4 x i32> %splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat11 = shufflevector <4 x i32> %splatinsert10, <4 x i32> poison, <4 x i32> zeroinitializer
   %add12 = add <4 x i32> %add8, %splat11
   %mul13 = mul nsw i32 53, %w
   %splatinsert14 = insertelement <4 x i32> poison, i32 %mul13, i32 0
-  %splat15 = shufflevector <4 x i32> %splatinsert14, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat15 = shufflevector <4 x i32> %splatinsert14, <4 x i32> poison, <4 x i32> zeroinitializer
   %add16 = add <4 x i32> %add12, %splat15
   %div17 = sdiv i32 %x, 820
   %splatinsert18 = insertelement <4 x i32> poison, i32 %div17, i32 0
-  %splat19 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat19 = shufflevector <4 x i32> %splatinsert18, <4 x i32> poison, <4 x i32> zeroinitializer
   %add20 = add <4 x i32> %add16, %splat19
   %mul21 = mul nsw i32 4, %u
   %splatinsert22 = insertelement <4 x i32> poison, i32 %mul21, i32 0
-  %splat23 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat23 = shufflevector <4 x i32> %splatinsert22, <4 x i32> poison, <4 x i32> zeroinitializer
   %add24 = add <4 x i32> %add20, %splat23
   %splatinsert25 = insertelement <4 x i32> poison, i32 %y, i32 0
-  %splat26 = shufflevector <4 x i32> %splatinsert25, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat26 = shufflevector <4 x i32> %splatinsert25, <4 x i32> poison, <4 x i32> zeroinitializer
   %add27 = add <4 x i32> %add24, %splat26
   %add28 = add <4 x i32> %add27, <i32 25, i32 25, i32 25, i32 25>
   %add29 = add <4 x i32> %add28, <i32 317400, i32 317400, i32 317400, i32 317400>

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/shuffle-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/shuffle-inseltpoison.ll
new file mode 100644
index 000000000000..bcb9a213f6ac
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/shuffle-inseltpoison.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O2                   -S -mattr=avx < %s | FileCheck %s
+; RUN: opt -passes='default<O2>' -S -mattr=avx < %s | FileCheck %s
+
+target triple = "x86_64--"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Eliminate redundant shuffles
+
+define <2 x i64> @shuffle_32_add_16_shuffle_32_masks_are_eq(<2 x i64> %v) {
+; CHECK-LABEL: @shuffle_32_add_16_shuffle_32_masks_are_eq(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[BC5:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[BC5]]
+;
+  %bc0 = bitcast <2 x i64> %v to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %bc0, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle to <2 x i64>
+  %bc2 = bitcast <2 x i64> %bc1 to <8 x i16>
+  %add.i = add <8 x i16> %bc2, %bc2
+  %bc3 = bitcast <8 x i16> %add.i to <2 x i64>
+  %bc4 = bitcast <2 x i64> %bc3 to <4 x i32>
+  %shuffle4 = shufflevector <4 x i32> %bc4, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc5 = bitcast <4 x i32> %shuffle4 to <2 x i64>
+  ret <2 x i64> %bc5
+}
+
+; Eliminate redundant shuffles
+
+define <2 x i64> @shuffle_32_add_8_shuffle_32_masks_are_eq(<2 x i64> %v) {
+; CHECK-LABEL: @shuffle_32_add_8_shuffle_32_masks_are_eq(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <16 x i8> [[TMP1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[BC5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[BC5]]
+;
+  %bc0 = bitcast <2 x i64> %v to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %bc0, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle to <2 x i64>
+  %bc2 = bitcast <2 x i64> %bc1 to <16 x i8>
+  %add.i = add <16 x i8> %bc2, %bc2
+  %bc3 = bitcast <16 x i8> %add.i to <2 x i64>
+  %bc4 = bitcast <2 x i64> %bc3 to <4 x i32>
+  %shuffle4 = shufflevector <4 x i32> %bc4, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc5 = bitcast <4 x i32> %shuffle4 to <2 x i64>
+  ret <2 x i64> %bc5
+}
+
+; Eliminate redundant shuffles
+
+define <2 x i64> @shuffle_8_add_32_shuffle_8_masks_are_eq(<2 x i64> %v) {
+; CHECK-LABEL: @shuffle_8_add_32_shuffle_8_masks_are_eq(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[BC5:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[BC5]]
+;
+  %bc0 = bitcast <2 x i64> %v to <16 x i8>
+  %shuffle = shufflevector <16 x i8> %bc0, <16 x i8> zeroinitializer, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bc1 = bitcast <16 x i8> %shuffle to <2 x i64>
+  %bc2 = bitcast <2 x i64> %bc1 to <4 x i32>
+  %add.i = add <4 x i32> %bc2, %bc2
+  %bc3 = bitcast <4 x i32> %add.i to <2 x i64>
+  %bc4 = bitcast <2 x i64> %bc3 to <16 x i8>
+  %shuffle4 = shufflevector <16 x i8> %bc4, <16 x i8> zeroinitializer, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bc5 = bitcast <16 x i8> %shuffle4 to <2 x i64>
+  ret <2 x i64> %bc5
+}
+
+; Single shuffle should sink.
+
+define <8 x i16> @shuffle_32_add_16_masks_are_eq(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: @shuffle_32_add_16_masks_are_eq(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V1:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[V2:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    ret <8 x i16> [[ADD]]
+;
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %shuffle2 = shufflevector <4 x i32> %v2, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <8 x i16>
+  %bc2 = bitcast <4 x i32> %shuffle2 to <8 x i16>
+  %add = add <8 x i16> %bc1, %bc2
+  ret <8 x i16> %add
+}
+
+; Sink single shuffle.
+
+define <16 x i8> @shuffle_32_add_8_masks_are_eq(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: @shuffle_32_add_8_masks_are_eq(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V1:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[V2:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <16 x i8> [[ADD]]
+;
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %shuffle2 = shufflevector <4 x i32> %v2, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <16 x i8>
+  %bc2 = bitcast <4 x i32> %shuffle2 to <16 x i8>
+  %add = add <16 x i8> %bc1, %bc2
+  ret <16 x i8> %add
+}
+
+; Sink single shuffle.
+
+define <16 x i8> @shuffle_16_add_8_masks_are_eq(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: @shuffle_16_add_8_masks_are_eq(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V1:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[V2:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 13>
+; CHECK-NEXT:    ret <16 x i8> [[ADD]]
+;
+  %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 4, i32 5, i32 7, i32 6>
+  %shuffle2 = shufflevector <8 x i16> %v2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 4, i32 5, i32 7, i32 6>
+  %bc1 = bitcast <8 x i16> %shuffle1 to <16 x i8>
+  %bc2 = bitcast <8 x i16> %shuffle2 to <16 x i8>
+  %add = add <16 x i8> %bc1, %bc2
+  ret <16 x i8> %add
+}
+
+; Sink single shuffle.
+
+define <4 x i32> @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[V2:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[ADD]]
+;
+  %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  %shuffle2 = shufflevector <8 x i16> %v2, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  %bc1 = bitcast <8 x i16> %shuffle1 to <4 x i32>
+  %bc2 = bitcast <8 x i16> %shuffle2 to <4 x i32>
+  %add = add <4 x i32> %bc1, %bc2
+  ret <4 x i32> %add
+}
+
+; Sink single shuffle.
+
+define <4 x i32> @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[V2:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[ADD]]
+;
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle2 = shufflevector <16 x i8> %v2, <16 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <4 x i32>
+  %bc2 = bitcast <16 x i8> %shuffle2 to <4 x i32>
+  %add = add <4 x i32> %bc1, %bc2
+  ret <4 x i32> %add
+}
+
+; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<4 x i32>(v)))
+; TODO: Squash shuffles and widen type?
+
+define <8 x i16> @shuffle_32_bitcast_16_shuffle_16_can_be_converted_up(<4 x i32> %v1) {
+; CHECK-LABEL: @shuffle_32_bitcast_16_shuffle_16_can_be_converted_up(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V1:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[BC1:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[BC1]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <8 x i16>
+  %shuffle2 = shufflevector <8 x i16> %bc1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+  ret <8 x i16> %shuffle2
+}
+
+; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<4 x i32>(v)))
+; TODO: Squash shuffles?
+
+define <8 x i16> @shuffle_32_bitcast_16_shuffle_16_can_not_be_converted_up(<4 x i32> %v1) {
+; CHECK-LABEL: @shuffle_32_bitcast_16_shuffle_16_can_not_be_converted_up(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V1:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[BC1:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[BC1]], <8 x i16> poison, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <8 x i16>
+  %shuffle2 = shufflevector <8 x i16> %bc1, <8 x i16> poison, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x i16> %shuffle2
+}
+
+; shuffle<16 x i8>( bitcast<16 x i8>( shuffle<4 x i32>(v)))
+; TODO: Squash shuffles and widen type?
+
+define <16 x i8> @shuffle_32_bitcast_8_shuffle_8_can_be_converted_up(<4 x i32> %v1) {
+; CHECK-LABEL: @shuffle_32_bitcast_8_shuffle_8_can_be_converted_up(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V1:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[BC1:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <16 x i8> [[BC1]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <16 x i8> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <16 x i8>
+  %shuffle2 = shufflevector <16 x i8> %bc1, <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x i8> %shuffle2
+}
+
+; shuffle<16 x i8>( bitcast<16 x i8>( shuffle<4 x i32>(v)))
+; TODO: Squash shuffles?
+
+define <16 x i8> @shuffle_32_bitcast_8_shuffle_8_can_not_be_converted_up(<4 x i32> %v1) {
+; CHECK-LABEL: @shuffle_32_bitcast_8_shuffle_8_can_not_be_converted_up(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V1:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[BC1:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <16 x i8> [[BC1]], <16 x i8> poison, <16 x i32> <i32 5, i32 4, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <16 x i8>
+  %shuffle2 = shufflevector <16 x i8> %bc1, <16 x i8> poison, <16 x i32> <i32 5, i32 4, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle2
+}
+
+; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<16 x i8>(v)))
+; TODO: squash shuffles?
+
+define <4 x i32> @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(<16 x i8> %v1) {
+; CHECK-LABEL: @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %bc1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  ret <4 x i32> %shuffle2
+}
+
+; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<8 x i16>(v)))
+; TODO: squash shuffles?
+
+define <4 x i32> @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up(<8 x i16> %v1) {
+; CHECK-LABEL: @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+  %bc1 = bitcast <8 x i16> %shuffle1 to <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %bc1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  ret <4 x i32> %shuffle2
+}
+
+; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<16 x i8>(v)))
+; TODO: Narrow and squash shuffles?
+
+define <4 x i32> @shuffle_8_bitcast_32_shuffle_32_can_not_be_converted_up(<16 x i8> %v1) {
+; CHECK-LABEL: @shuffle_8_bitcast_32_shuffle_32_can_not_be_converted_up(
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> poison, <16 x i32> <i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> poison, <16 x i32> <i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %bc1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  ret <4 x i32> %shuffle2
+}
+
+; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<8 x i16>(v)))
+; TODO: Narrow and squash shuffles?
+
+define <4 x i32> @shuffle_16_bitcast_32_shuffle_32_can_not_be_converted_up(<8 x i16> %v1) {
+; CHECK-LABEL: @shuffle_16_bitcast_32_shuffle_32_can_not_be_converted_up(
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x i16> [[V1:%.*]], <8 x i16> poison, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <8 x i16> [[SHUFFLE1]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> poison, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+  %bc1 = bitcast <8 x i16> %shuffle1 to <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %bc1, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  ret <4 x i32> %shuffle2
+}
+
+; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<16 x i8>(v)))
+; TODO: squash shuffles and widen type?
+
+define <8 x i16> @shuffle_8_bitcast_16_shuffle_16_can__be_converted_up(<16 x i8> %v1) {
+; CHECK-LABEL: @shuffle_8_bitcast_16_shuffle_16_can__be_converted_up(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[BC1:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[BC1]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <8 x i16>
+  %shuffle2 = shufflevector <8 x i16> %bc1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+  ret <8 x i16> %shuffle2
+}
+
+; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<16 x i8>(v)))
+; TODO: Narrow and squash shuffles?
+
+define <8 x i16> @shuffle_8_bitcast_16_shuffle_16_can_not_be_converted_up(<16 x i8> %v1) {
+; CHECK-LABEL: @shuffle_8_bitcast_16_shuffle_16_can_not_be_converted_up(
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> poison, <16 x i32> <i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <8 x i16>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[BC1]], <8 x i16> poison, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE2]]
+;
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> poison, <16 x i32> <i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <8 x i16>
+  %shuffle2 = shufflevector <8 x i16> %bc1, <8 x i16> poison, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+  ret <8 x i16> %shuffle2
+}

diff  --git a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
index 26fb97278d72..d785764e3366 100644
--- a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
+++ b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
@@ -157,7 +157,7 @@ loop:
   %add = add <4 x i32> %ext, %acc
   %cmp = icmp slt <4 x i32> %add, <i32 -10, i32 -11, i32 -12, i32 -13>
   %single = insertelement <4 x i32> poison, i32 %i, i32 0
-  %limit = shufflevector <4 x i32> %single, <4 x i32> undef,
+  %limit = shufflevector <4 x i32> %single, <4 x i32> poison,
                          <4 x i32> zeroinitializer
   %sel = select <4 x i1> %cmp, <4 x i32> %add, <4 x i32> %limit
   %trunc = trunc <4 x i32> %sel to <4 x i8>

diff  --git a/llvm/test/Transforms/Scalarizer/dbgloc-bug-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/dbgloc-bug-inseltpoison.ll
index 718018b28523..454f2b39a8de 100644
--- a/llvm/test/Transforms/Scalarizer/dbgloc-bug-inseltpoison.ll
+++ b/llvm/test/Transforms/Scalarizer/dbgloc-bug-inseltpoison.ll
@@ -13,7 +13,7 @@ bb1:
   %_tmp7 = tail call i16 @f1(), !dbg !13
 ; CHECK: call i16 @f1(), !dbg !13
   %broadcast.splatinsert5 = insertelement <4 x i16> poison, i16 %_tmp7, i32 0
-  %broadcast.splat6 = shufflevector <4 x i16> %broadcast.splatinsert5, <4 x i16> undef, <4 x i32> zeroinitializer
+  %broadcast.splat6 = shufflevector <4 x i16> %broadcast.splatinsert5, <4 x i16> poison, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:

diff  --git a/llvm/test/Transforms/Scalarizer/order-bug-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/order-bug-inseltpoison.ll
index fd7e009166ef..eb1ca57ebedf 100644
--- a/llvm/test/Transforms/Scalarizer/order-bug-inseltpoison.ll
+++ b/llvm/test/Transforms/Scalarizer/order-bug-inseltpoison.ll
@@ -14,7 +14,7 @@ y:
 ; CHECK: %f.upto1 = insertelement <4 x i32> %f.upto0, i32 %b.i0, i32 1
 ; CHECK: %f.upto2 = insertelement <4 x i32> %f.upto1, i32 %b.i0, i32 2
 ; CHECK: %f = insertelement <4 x i32> %f.upto2, i32 %b.i0, i32 3
-  %f = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer
   ret <4 x i32> %f
 
 z:

diff  --git a/llvm/test/Transforms/Scalarizer/phi-bug-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/phi-bug-inseltpoison.ll
new file mode 100644
index 000000000000..cf5488e65480
--- /dev/null
+++ b/llvm/test/Transforms/Scalarizer/phi-bug-inseltpoison.ll
@@ -0,0 +1,25 @@
+; RUN: opt %s -scalarizer -verify -S -o - | FileCheck %s
+; RUN: opt %s -passes='function(scalarizer,verify)' -S -o - | FileCheck %s
+
+define void @f3() local_unnamed_addr {
+bb1:
+  br label %bb2
+
+bb3:
+; CHECK-LABEL: bb3:
+; CHECK-NEXT: br label %bb4
+  %h.10.0.vec.insert = shufflevector <1 x i16> %h.10.1, <1 x i16> poison, <1 x i32> <i32 0>
+  br label %bb4
+
+bb2:
+; CHECK-LABEL: bb2:
+; CHECK: phi i16
+  %h.10.1 = phi <1 x i16> [ undef, %bb1 ]
+  br label %bb3
+
+bb4:
+; CHECK-LABEL: bb4:
+; CHECK: phi i16
+  %h.10.2 = phi <1 x i16> [ %h.10.0.vec.insert, %bb3 ]
+  ret void
+}

diff  --git a/llvm/test/Transforms/SpeculativeExecution/spec-other-inseltpoison.ll b/llvm/test/Transforms/SpeculativeExecution/spec-other-inseltpoison.ll
index c2eb266abbb0..9dd7c05ea9c9 100644
--- a/llvm/test/Transforms/SpeculativeExecution/spec-other-inseltpoison.ll
+++ b/llvm/test/Transforms/SpeculativeExecution/spec-other-inseltpoison.ll
@@ -9,7 +9,7 @@ define void @ifThen_shuffle() {
   br i1 true, label %a, label %b
 
 a:
-  %x = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+  %x = shufflevector <2 x float> undef, <2 x float> poison, <2 x i32> zeroinitializer
   br label %b
 
 b:

diff  --git a/llvm/test/Transforms/VectorCombine/AArch64/vscale-bitcast-shuffle-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/AArch64/vscale-bitcast-shuffle-inseltpoison.ll
new file mode 100644
index 000000000000..99ec8f7f10d5
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/vscale-bitcast-shuffle-inseltpoison.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -vector-combine -S -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test checks we are not crashing with TTI when trying to get shuffle cost.
+; This test also checks that the shuffle mask <vscale x 4 x i32> zeroinitializer is
+; not narrowed into <0, 1, 0, 1, ...>, for which we cannot determine whether it is
+; a valid splat or not.
+
+define <vscale x 8 x i16> @bitcast_shuffle(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @bitcast_shuffle(
+; CHECK-NEXT:    [[I:%.*]] = shufflevector <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = bitcast <vscale x 4 x i32> [[I]] to <vscale x 8 x i16>
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[R]]
+;
+  %i = shufflevector <vscale x 4 x i32> %a, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %r = bitcast <vscale x 4 x i32> %i to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %r
+}

diff  --git a/llvm/test/Transforms/VectorCombine/X86/no-sse-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/no-sse-inseltpoison.ll
new file mode 100644
index 000000000000..66d453f9fb09
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/no-sse-inseltpoison.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=-sse | FileCheck %s
+
+; Don't spend time on vector transforms if the target does not support vectors.
+
+define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
+; CHECK-LABEL: @bitcast_shuf_same_size(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <4 x float>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %shuf = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %r = bitcast <4 x i32> %shuf to <4 x float>
+  ret <4 x float> %r
+}

diff  --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll
new file mode 100644
index 000000000000..4dbea1d5e705
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
+
+; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb)
+
+define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) {
+; SSE-LABEL: @bitcast_shuf_narrow_element(
+; SSE-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_narrow_element(
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+; AVX-NEXT:    [[R:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    ret <16 x i8> [[R]]
+;
+  %shuf = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %r = bitcast <4 x i32> %shuf to <16 x i8>
+  ret <16 x i8> %r
+}
+
+; v4f32 is the same cost as v4i32, so this always works
+
+define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
+; CHECK-LABEL: @bitcast_shuf_same_size(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %shuf = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %r = bitcast <4 x i32> %shuf to <4 x float>
+  ret <4 x float> %r
+}
+
+; Negative test - length-changing shuffle
+
+define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) {
+; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_size(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %shuf = shufflevector <2 x i32> %v, <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+  %r = bitcast <4 x i32> %shuf to <16 x i8>
+  ret <16 x i8> %r
+}
+
+; Negative test - must cast to vector type
+
+define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) {
+; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_type(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to i128
+; CHECK-NEXT:    ret i128 [[R]]
+;
+  %shuf = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %r = bitcast <4 x i32> %shuf to i128
+  ret i128 %r
+}
+
+; Widen shuffle elements
+
+define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
+; CHECK-LABEL: @bitcast_shuf_wide_element(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %shuf = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
+  %r = bitcast <8 x i16> %shuf to <4 x i32>
+  ret <4 x i32> %r
+}
+
+declare void @use(<4 x i32>)
+
+; Negative test - don't create an extra shuffle
+
+define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) {
+; CHECK-LABEL: @bitcast_shuf_uses(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    call void @use(<4 x i32> [[SHUF]])
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %shuf = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  call void @use(<4 x i32> %shuf)
+  %r = bitcast <4 x i32> %shuf to <16 x i8>
+  ret <16 x i8> %r
+}
+
+define <2 x i64> @PR35454_1(<2 x i64> %v) {
+; SSE-LABEL: @PR35454_1(
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8>
+; SSE-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; SSE-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[BC3]]
+;
+; AVX-LABEL: @PR35454_1(
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <16 x i8>
+; AVX-NEXT:    [[BC1:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
+; AVX-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; AVX-NEXT:    ret <2 x i64> [[BC3]]
+;
+  %bc = bitcast <2 x i64> %v to <4 x i32>
+  %permil = shufflevector <4 x i32> %bc, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %permil to <16 x i8>
+  %add = shl <16 x i8> %bc1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %bc2 = bitcast <16 x i8> %add to <4 x i32>
+  %permil1 = shufflevector <4 x i32> %bc2, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc3 = bitcast <4 x i32> %permil1 to <2 x i64>
+  ret <2 x i64> %bc3
+}
+
+define <2 x i64> @PR35454_2(<2 x i64> %v) {
+; SSE-LABEL: @PR35454_2(
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16>
+; SSE-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[BC3]]
+;
+; AVX-LABEL: @PR35454_2(
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <8 x i16>
+; AVX-NEXT:    [[BC1:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
+; AVX-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; AVX-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
+; AVX-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; AVX-NEXT:    ret <2 x i64> [[BC3]]
+;
+  %bc = bitcast <2 x i64> %v to <4 x i32>
+  %permil = shufflevector <4 x i32> %bc, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %permil to <8 x i16>
+  %add = shl <8 x i16> %bc1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %bc2 = bitcast <8 x i16> %add to <4 x i32>
+  %permil1 = shufflevector <4 x i32> %bc2, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc3 = bitcast <4 x i32> %permil1 to <2 x i64>
+  ret <2 x i64> %bc3
+}

More information about the llvm-branch-commits mailing list