[llvm] [SLP] Avoid -passes=instcombine stages in SLP tests (PR #146257)

Sun Jun 29 12:50:48 PDT 2025

https://github.com/ericxu233 updated https://github.com/llvm/llvm-project/pull/146257

>From fd1d877baa031d10d49134fb227fef64b4a1219f Mon Sep 17 00:00:00 2001
From: ericxu233 <xuhanyang5 at gmail.com>
Date: Sat, 28 Jun 2025 19:05:01 -0400
Subject: [PATCH 1/3] [SLP] Avoid -passes=instcombine stages in SLP tests

---
 .../Transforms/SLPVectorizer/AArch64/gather-cost.ll  |  4 ++--
 .../SLPVectorizer/AArch64/gather-reduce.ll           |  4 ++--
 .../SLPVectorizer/AArch64/getelementptr.ll           |  4 ++--
 .../SLPVectorizer/AArch64/getelementptr2.ll          |  4 ++--
 .../Transforms/SLPVectorizer/AArch64/loadorder.ll    |  2 +-
 .../SLPVectorizer/AArch64/transpose-inseltpoison.ll  |  2 +-
 .../Transforms/SLPVectorizer/AArch64/transpose.ll    |  2 +-
 .../SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll |  6 +++---
 .../Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll   |  6 +++---
 llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll |  6 +++---
 .../SLPVectorizer/WebAssembly/no-vectorize-rotate.ll |  2 +-
 .../X86/alternate-calls-inseltpoison.ll              | 12 ++++++------
 .../Transforms/SLPVectorizer/X86/alternate-calls.ll  | 12 ++++++------
 .../SLPVectorizer/X86/alternate-cast-inseltpoison.ll | 12 ++++++------
 .../Transforms/SLPVectorizer/X86/alternate-cast.ll   | 12 ++++++------
 .../SLPVectorizer/X86/alternate-fp-inseltpoison.ll   | 12 ++++++------
 .../Transforms/SLPVectorizer/X86/alternate-fp.ll     | 12 ++++++------
 .../SLPVectorizer/X86/alternate-int-inseltpoison.ll  | 12 ++++++------
 .../Transforms/SLPVectorizer/X86/alternate-int.ll    | 12 ++++++------
 .../X86/blending-shuffle-inseltpoison.ll             |  2 +-
 .../Transforms/SLPVectorizer/X86/blending-shuffle.ll |  2 +-
 .../SLPVectorizer/X86/cmp_commute-inseltpoison.ll    |  4 ++--
 .../test/Transforms/SLPVectorizer/X86/cmp_commute.ll |  4 ++--
 .../SLPVectorizer/X86/hadd-inseltpoison.ll           | 12 ++++++------
 llvm/test/Transforms/SLPVectorizer/X86/hadd.ll       | 12 ++++++------
 .../SLPVectorizer/X86/hsub-inseltpoison.ll           | 12 ++++++------
 llvm/test/Transforms/SLPVectorizer/X86/hsub.ll       | 12 ++++++------
 .../Transforms/SLPVectorizer/X86/minimum-sizes.ll    | 10 +++++-----
 .../Transforms/SLPVectorizer/X86/operandorder.ll     |  4 ++--
 llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll    |  2 +-
 llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll    | 10 +++++-----
 llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll    | 10 +++++-----
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll        | 10 +++++-----
 llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll    | 10 +++++-----
 llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll    |  2 +-
 llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll    |  2 +-
 .../SLPVectorizer/X86/reverse_extract_elements.ll    |  2 +-
 .../Transforms/SLPVectorizer/revec-shufflevector.ll  |  2 +-
 38 files changed, 131 insertions(+), 131 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index e460f558f4723..e857cb8136410 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=slp-vectorizer,instcombine -pass-remarks-output=%t | FileCheck %s
+; RUN: opt < %s -S -passes=slp-vectorizer -pass-remarks-output=%t | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=REMARK %s
-; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s
+; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer' -pass-remarks-output=%t | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=REMARK %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index 900d5f293b5b8..f902b2cfa2ea0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=slp-vectorizer,dce,instcombine < %s | FileCheck %s --check-prefix=GENERIC
-; RUN: opt -S -mcpu=kryo -passes=slp-vectorizer,dce,instcombine < %s | FileCheck %s --check-prefix=KRYO
+; RUN: opt -S -passes=slp-vectorizer,dce < %s | FileCheck %s --check-prefix=GENERIC
+; RUN: opt -S -mcpu=kryo -passes=slp-vectorizer,dce < %s | FileCheck %s --check-prefix=KRYO
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index c43b79e138a30..01b16bb7db619 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 ; These tests check that we remove from consideration pairs of seed
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
index 5808d64d925c6..7aa4c835fc7f5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index f138065101c4b..ea02bdea36a7e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=slp-vectorizer,instcombine -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64--linux-gnu < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index c84c333391350..2a92a84270981 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index e4fcb1ed08be9..6e0b9a9447386 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index 783a1e83c6724..e5eb33547a68b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
 define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-LABEL: @uadd_sat_v2i16(
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 7e31ec9a0b39a..ae9b881399343 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
 define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-LABEL: @uadd_sat_v2i16(
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
index 46c6c10125b95..17045d47c8c84 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
 define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-LABEL: @uadd_sat_v2i16(
diff --git a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
index 7faf11c34d76e..a47cd575cf8e3 100644
--- a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
+++ b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s
 
 ; Regression test for a bug in the SLP vectorizer that was causing
 ; these rotates to be incorrectly combined into a vector rotate.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
index 77d36f0107665..42b39bf7f3590 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
index 18d79752b0b44..b0bc7647a755b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
index 3e2c305dbed65..2d60078ca1eac 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE2-LABEL: @sitofp_uitofp(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index 880523d6474ac..1afb4298447c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE2-LABEL: @sitofp_uitofp(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
index 5cee6984df04f..79179ed74c9b8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
 
 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fadd_fsub_v8f32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
index 9a2f959ac63bc..f510b5f978b4f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
 
 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fadd_fsub_v8f32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
index f8c5df9944538..daca280d7bc84 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
 
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @add_sub_v8i32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index b84ef027f67c5..6ba4330272329 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
 
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @add_sub_v8i32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
index 3ee6c55e429bd..8fc2576d18008 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s
 
 define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @g(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index 60e008d12e6e1..2eb45fe61df09 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s
 
 define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @g(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
index fd5f09bf2adc0..7d5ea22436c7c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
 
 ;
 ; Check that we can commute operands based on the predicate.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
index 35619d6d3ad1d..25aa24b4cc81d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
 
 ;
 ; Check that we can commute operands based on the predicate.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
index b85ec5bce8192..08fd31cc32a5f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 
 ;
 ; 128-bit vectors
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index b83d35541bbae..19435f2918b9f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 
 ;
 ; 128-bit vectors
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
index 40b6a8c32f5d0..33de35dbcb6c9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ;
 ; 128-bit vectors
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index 09113323d3ab7..8300b51cfbf95 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ;
 ; 128-bit vectors
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index 9c9a89ac76d8b..69cd8cab2df14 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+sse2     -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx      -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx2     -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512f  -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512vl -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+sse2     -S | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx      -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx2     -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx512f  -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx512vl -S | FileCheck %s --check-prefix=AVX
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index 61938d01e57ac..4598112dce799 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: opt < %s -passes=slp-vectorizer,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
index 2588012847d09..a444db1a55ae7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-- -mcpu=corei7 < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-- -mcpu=corei7 < %s | FileCheck %s
 
 define void @test1(float %a, float %b, float %c, float %d, ptr nocapture %p) {
 ; CHECK-LABEL: @test1(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index bae127527bbbe..362ce9e460c24 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2   -slp-threshold=-1 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx    | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2   | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2   -slp-threshold=-1 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx    | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2   | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s
 
 define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
index a9c0eb3f9f2b9..406d2e660dc9f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
 
 
 @b = global [8 x i32] zeroinitializer, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index d2617a1986764..9273333f1d6b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512F
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512F
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL
 
 define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index 8f31200a3683d..bd6dbdeafffc3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512F
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512F
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL
 
 define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
index 42a50384787c8..b4b6f43d77785 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer,instcombine -S < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -S < %s | FileCheck %s
 ; These code should be fully vectorized by D57059 patch
 
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
index e7239f906c59d..4623db5ce7060 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64-- -passes=slp-vectorizer,instcombine -S < %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -passes=slp-vectorizer -S < %s | FileCheck %s
 
 ; These conversions should be vectorized by reviews.llvm.org/D57059
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
index 54120f5402f57..70ab6d09a9236 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
 
 define float @dotf(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: @dotf(
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
index b85c78ec8d2d0..c38ca3f7ee24b 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s
-; RUN: opt -passes=slp-vectorizer,instcombine -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s --check-prefix COMBINE
+; RUN: opt -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s --check-prefix COMBINE
 
 define void @test1(ptr %in, ptr %out) {
 ; CHECK-LABEL: @test1(

>From 3b18b7912cf626c487c3a911e4b9e126a3341872 Mon Sep 17 00:00:00 2001
From: ericxu233 <xuhanyang5 at gmail.com>
Date: Sat, 28 Jun 2025 19:06:52 -0400
Subject: [PATCH 2/3] Added updated checks

---
 .../SLPVectorizer/AArch64/gather-cost.ll      |  36 +-
 .../SLPVectorizer/AArch64/gather-reduce.ll    | 132 +++---
 .../SLPVectorizer/AArch64/getelementptr.ll    |  56 +--
 .../SLPVectorizer/AArch64/getelementptr2.ll   |  22 +-
 .../SLPVectorizer/AArch64/loadorder.ll        | 353 ++++++++++----
 .../AArch64/transpose-inseltpoison.ll         |   4 +-
 .../SLPVectorizer/AArch64/transpose.ll        |   4 +-
 .../AMDGPU/add_sub_sat-inseltpoison.ll        |  30 +-
 .../SLPVectorizer/AMDGPU/add_sub_sat.ll       |  50 +-
 .../SLPVectorizer/AMDGPU/min_max.ll           |  52 +-
 .../WebAssembly/no-vectorize-rotate.ll        |  10 +-
 .../X86/alternate-calls-inseltpoison.ll       |  92 ++--
 .../SLPVectorizer/X86/alternate-calls.ll      | 104 ++--
 .../X86/alternate-cast-inseltpoison.ll        |  68 ++-
 .../SLPVectorizer/X86/alternate-cast.ll       |  68 ++-
 .../X86/alternate-fp-inseltpoison.ll          | 111 +++--
 .../SLPVectorizer/X86/alternate-fp.ll         | 111 +++--
 .../X86/alternate-int-inseltpoison.ll         | 227 +++++----
 .../SLPVectorizer/X86/alternate-int.ll        | 227 +++++----
 .../X86/blending-shuffle-inseltpoison.ll      |  32 +-
 .../SLPVectorizer/X86/blending-shuffle.ll     |  42 +-
 .../SLPVectorizer/X86/hadd-inseltpoison.ll    |  66 +--
 .../test/Transforms/SLPVectorizer/X86/hadd.ll |  78 +--
 .../SLPVectorizer/X86/hsub-inseltpoison.ll    |  60 +--
 .../test/Transforms/SLPVectorizer/X86/hsub.ll |  60 +--
 .../SLPVectorizer/X86/minimum-sizes.ll        |  64 +--
 .../SLPVectorizer/X86/operandorder.ll         |  82 ++--
 .../Transforms/SLPVectorizer/X86/pr40522.ll   |  32 +-
 .../Transforms/SLPVectorizer/X86/pr46983.ll   |  20 +-
 .../Transforms/SLPVectorizer/X86/pr47623.ll   |  16 +-
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 443 +++++++++---------
 .../Transforms/SLPVectorizer/X86/pr47629.ll   | 443 +++++++++---------
 .../Transforms/SLPVectorizer/X86/pr47642.ll   |   8 +-
 .../Transforms/SLPVectorizer/X86/pr49081.ll   |   5 +-
 .../SLPVectorizer/revec-shufflevector.ll      |  91 +++-
 35 files changed, 1910 insertions(+), 1389 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index e857cb8136410..8dc88f7b96716 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -16,10 +16,10 @@ target triple = "aarch64--linux-gnu"
 
 define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: @gather_multiple_use(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], splat (i32 15)
 ; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP5]], splat (i32 65537)
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], splat (i32 65535)
@@ -57,22 +57,26 @@ define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 @data = global [6 x [258 x i8]] zeroinitializer, align 1
 define void @gather_load(ptr noalias %ptr) {
 ; CHECK-LABEL: @gather_load(
-; CHECK-NEXT:    [[ARRAYIDX182:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX183:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 6
-; CHECK-NEXT:    [[ARRAYIDX185:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 8
-; CHECK-NEXT:    [[L0:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 258), align 1
+; CHECK-NEXT:    [[ARRAYIDX182:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX183:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX185:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX149:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 1, i64 0
+; CHECK-NEXT:    [[L0:%.*]] = load i8, ptr [[ARRAYIDX149]], align 1
 ; CHECK-NEXT:    [[CONV150:%.*]] = zext i8 [[L0]] to i16
-; CHECK-NEXT:    [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 517), align 1
+; CHECK-NEXT:    [[ADD152:%.*]] = add i16 10, [[CONV150]]
+; CHECK-NEXT:    [[ARRAYIDX155:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 2, i64 1
+; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[ARRAYIDX155]], align 1
 ; CHECK-NEXT:    [[CONV156:%.*]] = zext i8 [[L1]] to i16
-; CHECK-NEXT:    [[ADD158:%.*]] = add nuw nsw i16 [[CONV156]], 20
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 776), align 1
+; CHECK-NEXT:    [[ADD158:%.*]] = add i16 20, [[CONV156]]
+; CHECK-NEXT:    [[ARRAYIDX161:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 3, i64 2
+; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[ARRAYIDX161]], align 1
 ; CHECK-NEXT:    [[CONV162:%.*]] = zext i8 [[L2]] to i16
-; CHECK-NEXT:    [[ADD164:%.*]] = add nuw nsw i16 [[CONV162]], 30
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 1035), align 1
+; CHECK-NEXT:    [[ADD164:%.*]] = add i16 30, [[CONV162]]
+; CHECK-NEXT:    [[ARRAYIDX167:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 4, i64 3
+; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[ARRAYIDX167]], align 1
 ; CHECK-NEXT:    [[CONV168:%.*]] = zext i8 [[L3]] to i16
-; CHECK-NEXT:    [[ADD170:%.*]] = add nuw nsw i16 [[CONV168]], 40
+; CHECK-NEXT:    [[ADD170:%.*]] = add i16 40, [[CONV168]]
 ; CHECK-NEXT:    store i16 [[ADD152]], ptr [[ARRAYIDX182]], align 2
 ; CHECK-NEXT:    store i16 [[ADD158]], ptr [[ARRAYIDX183]], align 2
 ; CHECK-NEXT:    store i16 [[ADD164]], ptr [[ARRAYIDX184]], align 2
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index f902b2cfa2ea0..4f8fc339fb64b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -36,57 +36,49 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; GENERIC-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; GENERIC-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
 ; GENERIC-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
 ; GENERIC-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; GENERIC-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
-; GENERIC-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
+; GENERIC-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i32 [[TMP5]]
 ; GENERIC-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
 ; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
-; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; GENERIC-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
-; GENERIC-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
+; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]]
+; GENERIC-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1
+; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP8]]
 ; GENERIC-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
 ; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; GENERIC-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
-; GENERIC-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
+; GENERIC-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
+; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP9]]
 ; GENERIC-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
 ; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
 ; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; GENERIC-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
-; GENERIC-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
+; GENERIC-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
+; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP11]]
 ; GENERIC-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
 ; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; GENERIC-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
-; GENERIC-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
+; GENERIC-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4
+; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP14]]
 ; GENERIC-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
 ; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
 ; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; GENERIC-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
-; GENERIC-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
+; GENERIC-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5
+; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP15]]
 ; GENERIC-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
 ; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
 ; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; GENERIC-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
-; GENERIC-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
+; GENERIC-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6
+; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP17]]
 ; GENERIC-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
 ; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
-; GENERIC-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
+; GENERIC-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
+; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP20]]
 ; GENERIC-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
 ; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP28]] to i32
 ; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
@@ -109,57 +101,49 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; KRYO-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; KRYO-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
 ; KRYO-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
 ; KRYO-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; KRYO-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
-; KRYO-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
-; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
+; KRYO-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0
+; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i32 [[TMP5]]
 ; KRYO-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
 ; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
-; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; KRYO-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
-; KRYO-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
+; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]]
+; KRYO-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1
+; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP8]]
 ; KRYO-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
 ; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; KRYO-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
-; KRYO-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
+; KRYO-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
+; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP9]]
 ; KRYO-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
 ; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
 ; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; KRYO-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
-; KRYO-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
+; KRYO-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
+; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP11]]
 ; KRYO-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
 ; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; KRYO-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
-; KRYO-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
-; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
+; KRYO-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4
+; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP14]]
 ; KRYO-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
 ; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
 ; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; KRYO-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
-; KRYO-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
-; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
+; KRYO-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5
+; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP15]]
 ; KRYO-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
 ; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
 ; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; KRYO-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
-; KRYO-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
+; KRYO-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6
+; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP17]]
 ; KRYO-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
 ; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
-; KRYO-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
-; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
+; KRYO-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
+; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP20]]
 ; KRYO-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
 ; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP28]] to i32
 ; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
@@ -293,55 +277,55 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; GENERIC-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; GENERIC-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
 ; GENERIC-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; GENERIC-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; GENERIC-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
+; GENERIC-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]]
+; GENERIC-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0
 ; GENERIC-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
 ; GENERIC-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
 ; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
-; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; GENERIC-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
+; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]]
+; GENERIC-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1
 ; GENERIC-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
 ; GENERIC-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
 ; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; GENERIC-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
+; GENERIC-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
 ; GENERIC-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
 ; GENERIC-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
 ; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
 ; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; GENERIC-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
+; GENERIC-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
 ; GENERIC-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
 ; GENERIC-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
 ; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; GENERIC-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
+; GENERIC-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4
 ; GENERIC-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
 ; GENERIC-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
 ; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
 ; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; GENERIC-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
+; GENERIC-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5
 ; GENERIC-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
 ; GENERIC-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
 ; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
 ; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; GENERIC-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
+; GENERIC-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6
 ; GENERIC-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
 ; GENERIC-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
 ; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
+; GENERIC-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
 ; GENERIC-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
 ; GENERIC-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
@@ -366,55 +350,55 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; KRYO-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; KRYO-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
 ; KRYO-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; KRYO-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; KRYO-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
+; KRYO-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]]
+; KRYO-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0
 ; KRYO-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
 ; KRYO-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
 ; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
-; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; KRYO-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
+; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]]
+; KRYO-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1
 ; KRYO-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
 ; KRYO-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
 ; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; KRYO-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
+; KRYO-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
 ; KRYO-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
 ; KRYO-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
 ; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
 ; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; KRYO-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
+; KRYO-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
 ; KRYO-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
 ; KRYO-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
 ; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; KRYO-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
+; KRYO-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4
 ; KRYO-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
 ; KRYO-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
 ; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
 ; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; KRYO-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
+; KRYO-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5
 ; KRYO-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
 ; KRYO-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
 ; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
 ; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; KRYO-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
+; KRYO-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6
 ; KRYO-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
 ; KRYO-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
 ; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
+; KRYO-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
 ; KRYO-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
 ; KRYO-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index 01b16bb7db619..0a8ad26c900f8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -46,9 +46,9 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -58,29 +58,25 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = zext nneg i32 [[TMP6]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[G:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP6]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0
-; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1
-; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
 ; CHECK-NEXT:    [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
@@ -143,7 +139,7 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[Y:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[Y:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -153,26 +149,22 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OP_RDX]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = zext nneg i32 [[TMP4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[G:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP3]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[T11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T11]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i64 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T12]], i64 3
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T12]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP8]], <2 x i32> [[TMP6]], i64 0)
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
 ; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
index 7aa4c835fc7f5..86d670551c10b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -25,42 +25,42 @@ define void @test_i16_extend(ptr %p.1, ptr %p.2, i32 %idx.i32) {
 ; CHECK-LABEL: @test_i16_extend(
 ; CHECK-NEXT:    [[P_0:%.*]] = load ptr, ptr @global, align 8
 ; CHECK-NEXT:    [[IDX_0:%.*]] = zext i32 [[IDX_I32:%.*]] to i64
-; CHECK-NEXT:    [[T53:%.*]] = getelementptr inbounds nuw i16, ptr [[P_1:%.*]], i64 [[IDX_0]]
-; CHECK-NEXT:    [[T56:%.*]] = getelementptr inbounds nuw i16, ptr [[P_2:%.*]], i64 [[IDX_0]]
+; CHECK-NEXT:    [[T53:%.*]] = getelementptr inbounds i16, ptr [[P_1:%.*]], i64 [[IDX_0]]
+; CHECK-NEXT:    [[T56:%.*]] = getelementptr inbounds i16, ptr [[P_2:%.*]], i64 [[IDX_0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[T53]], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr [[T56]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i16> [[TMP3]] to <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT:    [[T60:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[T60]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; CHECK-NEXT:    [[T71:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[T71]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
 ; CHECK-NEXT:    [[T82:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[L_3:%.*]] = load i32, ptr [[T82]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[TMP5]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[TMP5]], i32 3
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
 ; CHECK-NEXT:    [[T93:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[L_4:%.*]] = load i32, ptr [[T93]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP5]], i64 4
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
 ; CHECK-NEXT:    [[T104:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[L_5:%.*]] = load i32, ptr [[T104]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP5]], i64 5
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
 ; CHECK-NEXT:    [[T115:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[L_6:%.*]] = load i32, ptr [[T115]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP5]], i64 6
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP5]], i32 6
 ; CHECK-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
 ; CHECK-NEXT:    [[T126:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP19]]
 ; CHECK-NEXT:    [[L_7:%.*]] = load i32, ptr [[T126]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP5]], i64 7
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP5]], i32 7
 ; CHECK-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
 ; CHECK-NEXT:    [[T137:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[L_8:%.*]] = load i32, ptr [[T137]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index ea02bdea36a7e..04bd969da6304 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -133,7 +133,7 @@ define i16 @reduce_blockstrided2(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-LABEL: @reduce_blockstrided2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[X:%.*]], align 2
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
@@ -165,7 +165,7 @@ define i16 @reduce_blockstrided2(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM15]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX32]], align 2
-; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX33]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX36:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX36]], align 2
@@ -254,9 +254,9 @@ define i16 @reduce_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-LABEL: @reduce_blockstrided3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[L0:%.*]] = load i16, ptr [[X:%.*]], align 2
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 1
 ; CHECK-NEXT:    [[L1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 2
 ; CHECK-NEXT:    [[L2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
@@ -269,10 +269,12 @@ define i16 @reduce_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM9]]
 ; CHECK-NEXT:    [[L6:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3
+; CHECK-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
 ; CHECK-NEXT:    [[L8:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 1
 ; CHECK-NEXT:    [[L9:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 2
 ; CHECK-NEXT:    [[L10:%.*]] = load i16, ptr [[ARRAYIDX16]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]]
 ; CHECK-NEXT:    [[L12:%.*]] = load i16, ptr [[ARRAYIDX20]], align 2
@@ -347,9 +349,9 @@ define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP6]])
 ; CHECK-NEXT:    ret i16 [[TMP7]]
 ;
@@ -414,33 +416,127 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
+; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32
+; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT:    [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32
+; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1
+; CHECK-NEXT:    [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32
+; CHECK-NEXT:    [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1
+; CHECK-NEXT:    [[CONV14_2:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1
+; CHECK-NEXT:    [[CONV16_2:%.*]] = zext i8 [[TMP7]] to i32
+; CHECK-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1
+; CHECK-NEXT:    [[CONV21_2:%.*]] = zext i8 [[TMP8]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1
+; CHECK-NEXT:    [[CONV23_2:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1
+; CHECK-NEXT:    [[CONV26_2:%.*]] = zext i8 [[TMP41]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1
+; CHECK-NEXT:    [[CONV28_2:%.*]] = zext i8 [[TMP42]] to i32
+; CHECK-NEXT:    [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3
+; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1
+; CHECK-NEXT:    [[CONV33_2:%.*]] = zext i8 [[TMP43]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3
+; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1
+; CHECK-NEXT:    [[CONV35_2:%.*]] = zext i8 [[TMP44]] to i32
+; CHECK-NEXT:    [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1
+; CHECK-NEXT:    [[CONV38_2:%.*]] = zext i8 [[TMP14]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1
+; CHECK-NEXT:    [[CONV40_2:%.*]] = zext i8 [[TMP15]] to i32
+; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP16]] to i32
+; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[CONV2_3:%.*]] = zext i8 [[TMP17]] to i32
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[TMP48:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[CONV4_3:%.*]] = zext i8 [[TMP48]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
+; CHECK-NEXT:    [[TMP49:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[CONV6_3:%.*]] = zext i8 [[TMP49]] to i32
+; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1
+; CHECK-NEXT:    [[CONV9_3:%.*]] = zext i8 [[TMP50]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1
+; CHECK-NEXT:    [[CONV11_3:%.*]] = zext i8 [[TMP51]] to i32
+; CHECK-NEXT:    [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5
+; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1
+; CHECK-NEXT:    [[CONV14_3:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 5
+; CHECK-NEXT:    [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX15_3]], align 1
+; CHECK-NEXT:    [[CONV16_3:%.*]] = zext i8 [[TMP23]] to i32
+; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1
+; CHECK-NEXT:    [[CONV21_3:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 2
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1
+; CHECK-NEXT:    [[CONV23_3:%.*]] = zext i8 [[TMP25]] to i32
+; CHECK-NEXT:    [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 6
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX25_3]], align 1
+; CHECK-NEXT:    [[CONV26_3:%.*]] = zext i8 [[TMP26]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 6
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1
+; CHECK-NEXT:    [[CONV28_3:%.*]] = zext i8 [[TMP27]] to i32
+; CHECK-NEXT:    [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 3
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1
+; CHECK-NEXT:    [[CONV33_3:%.*]] = zext i8 [[TMP28]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 3
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1
+; CHECK-NEXT:    [[CONV35_3:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 7
+; CHECK-NEXT:    [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX37_3]], align 1
+; CHECK-NEXT:    [[CONV38_3:%.*]] = zext i8 [[TMP30]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1
+; CHECK-NEXT:    [[CONV40_3:%.*]] = zext i8 [[TMP31]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP38:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP38]], <4 x i8> [[TMP4]], i64 4)
+; CHECK-NEXT:    [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP1]], i64 8)
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP5]], i64 12)
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP45:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP46:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP45]], <4 x i8> [[TMP12]], i64 4)
+; CHECK-NEXT:    [[TMP47:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP46]], <4 x i8> [[TMP3]], i64 8)
+; CHECK-NEXT:    [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP47]], <4 x i8> [[TMP13]], i64 12)
 ; CHECK-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP11]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = mul <16 x i32> [[TMP11]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]])
 ; CHECK-NEXT:    ret i32 [[TMP21]]
 ;
@@ -677,7 +773,7 @@ entry:
 define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) {
 ; CHECK-LABEL: @store_blockstrided3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[X:%.*]], i64 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1
 ; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64
@@ -696,7 +792,7 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1
 ; CHECK-NEXT:    [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
-; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8
+; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
 ; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
@@ -705,20 +801,21 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
-; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds nuw i8, ptr [[Z:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, ptr [[Z:%.*]], i64 1
 ; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 24
+; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 6
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 28
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 7
 ; CHECK-NEXT:    [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
 ; CHECK-NEXT:    [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
-; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 44
+; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 11
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX49]], align 4
@@ -727,9 +824,10 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
 ; CHECK-NEXT:    store i32 [[MUL81]], ptr [[ARRAYIDX76]], align 4
 ; CHECK-NEXT:    store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP17]]
-; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP21]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -837,9 +935,10 @@ define void @store_blockstrided4(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
 ; CHECK-NEXT:    store <8 x i16> [[TMP6]], ptr [[DST0:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
@@ -912,35 +1011,131 @@ define void @store_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4
-; CHECK-NEXT:    [[DST4:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0:%.*]], i64 16
-; CHECK-NEXT:    [[DST8:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0]], i64 32
-; CHECK-NEXT:    [[DST12:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0]], i64 48
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
+; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32
+; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT:    [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32
+; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1
+; CHECK-NEXT:    [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32
+; CHECK-NEXT:    [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1
+; CHECK-NEXT:    [[CONV14_2:%.*]] = zext i8 [[TMP38]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1
+; CHECK-NEXT:    [[CONV16_2:%.*]] = zext i8 [[TMP39]] to i32
+; CHECK-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2
+; CHECK-NEXT:    [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1
+; CHECK-NEXT:    [[CONV21_2:%.*]] = zext i8 [[TMP40]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1
+; CHECK-NEXT:    [[CONV23_2:%.*]] = zext i8 [[TMP41]] to i32
+; CHECK-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1
+; CHECK-NEXT:    [[CONV26_2:%.*]] = zext i8 [[TMP42]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6
+; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1
+; CHECK-NEXT:    [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32
+; CHECK-NEXT:    [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3
+; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1
+; CHECK-NEXT:    [[CONV33_2:%.*]] = zext i8 [[TMP44]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3
+; CHECK-NEXT:    [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1
+; CHECK-NEXT:    [[CONV35_2:%.*]] = zext i8 [[TMP45]] to i32
+; CHECK-NEXT:    [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7
+; CHECK-NEXT:    [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1
+; CHECK-NEXT:    [[CONV38_2:%.*]] = zext i8 [[TMP46]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7
+; CHECK-NEXT:    [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1
+; CHECK-NEXT:    [[CONV40_2:%.*]] = zext i8 [[TMP47]] to i32
+; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32
+; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP49:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[CONV2_3:%.*]] = zext i8 [[TMP49]] to i32
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[CONV4_3:%.*]] = zext i8 [[TMP50]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
+; CHECK-NEXT:    [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[CONV6_3:%.*]] = zext i8 [[TMP51]] to i32
+; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1
+; CHECK-NEXT:    [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1
+; CHECK-NEXT:    [[CONV9_3:%.*]] = zext i8 [[TMP20]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1
+; CHECK-NEXT:    [[CONV11_3:%.*]] = zext i8 [[TMP21]] to i32
+; CHECK-NEXT:    [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5
+; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1
+; CHECK-NEXT:    [[CONV14_3:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 5
+; CHECK-NEXT:    [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX15_3]], align 1
+; CHECK-NEXT:    [[CONV16_3:%.*]] = zext i8 [[TMP23]] to i32
+; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1
+; CHECK-NEXT:    [[CONV21_3:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 2
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1
+; CHECK-NEXT:    [[CONV23_3:%.*]] = zext i8 [[TMP25]] to i32
+; CHECK-NEXT:    [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 6
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX25_3]], align 1
+; CHECK-NEXT:    [[CONV26_3:%.*]] = zext i8 [[TMP26]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 6
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1
+; CHECK-NEXT:    [[CONV28_3:%.*]] = zext i8 [[TMP27]] to i32
+; CHECK-NEXT:    [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 3
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1
+; CHECK-NEXT:    [[CONV33_3:%.*]] = zext i8 [[TMP28]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 3
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1
+; CHECK-NEXT:    [[CONV35_3:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 7
+; CHECK-NEXT:    [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX37_3]], align 1
+; CHECK-NEXT:    [[CONV38_3:%.*]] = zext i8 [[TMP30]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1
+; CHECK-NEXT:    [[CONV40_3:%.*]] = zext i8 [[TMP31]] to i32
+; CHECK-NEXT:    [[DST4:%.*]] = getelementptr inbounds i32, ptr [[DST0:%.*]], i64 4
+; CHECK-NEXT:    [[DST8:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 8
+; CHECK-NEXT:    [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <4 x i32> [[TMP11]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = mul <4 x i32> [[TMP16]], [[TMP18]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[DST0]], align 4
 ; CHECK-NEXT:    store <4 x i32> [[TMP9]], ptr [[DST4]], align 4
 ; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[DST8]], align 4
@@ -1198,20 +1393,20 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_1]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_2]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_2]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
@@ -1225,33 +1420,29 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP14:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP13]], <4 x i8> [[TMP4]], i64 4)
+; CHECK-NEXT:    [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP14]], <4 x i8> [[TMP8]], i64 8)
+; CHECK-NEXT:    [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP12]], i64 12)
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP5]], i64 4)
+; CHECK-NEXT:    [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP20]], <4 x i8> [[TMP9]], i64 8)
+; CHECK-NEXT:    [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP19]], i64 12)
 ; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP29:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP30:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP29]], <4 x i8> [[TMP6]], i64 4)
+; CHECK-NEXT:    [[TMP28:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP30]], <4 x i8> [[TMP10]], i64 8)
+; CHECK-NEXT:    [[TMP32:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP28]], <4 x i8> [[TMP27]], i64 12)
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP35:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP3]], i64 0)
+; CHECK-NEXT:    [[TMP36:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP35]], <4 x i8> [[TMP7]], i64 4)
+; CHECK-NEXT:    [[TMP37:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP36]], <4 x i8> [[TMP11]], i64 8)
+; CHECK-NEXT:    [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP37]], <4 x i8> [[TMP34]], i64 12)
 ; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], splat (i32 16)
@@ -1259,11 +1450,11 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
 ; CHECK-NEXT:    [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]]
-; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP46]], <16 x i32> [[TMP45]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
 ; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
 ; CHECK-NEXT:    [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]]
-; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP50]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 20, i32 21, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 29, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP51]], [[TMP52]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP52]]
@@ -1271,7 +1462,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]]
 ; CHECK-NEXT:    [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]]
-; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP60:%.*]] = lshr <16 x i32> [[TMP59]], splat (i32 15)
 ; CHECK-NEXT:    [[TMP61:%.*]] = and <16 x i32> [[TMP60]], splat (i32 65537)
 ; CHECK-NEXT:    [[TMP62:%.*]] = mul nuw <16 x i32> [[TMP61]], splat (i32 65535)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index 2a92a84270981..0f47c6b3ac902 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -167,7 +167,9 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 6e0b9a9447386..370ed1f258aca 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -167,7 +167,9 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index e5eb33547a68b..a8d1c94d59be3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -264,8 +264,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -273,8 +276,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -317,19 +323,27 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
-; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
+; GFX8-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]])
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
-; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index ae9b881399343..b09022e8289a1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -12,7 +12,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -47,7 +47,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -82,7 +82,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -117,7 +117,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -152,7 +152,7 @@ define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -177,7 +177,7 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -202,7 +202,7 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -227,7 +227,7 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -255,7 +255,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX7-NEXT:    ret <3 x i16> [[INS_2]]
@@ -264,8 +264,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -273,8 +276,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -309,7 +315,7 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
 ; GFX7-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX7-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
@@ -317,19 +323,27 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
-; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
+; GFX8-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]])
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
-; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
index 17045d47c8c84..57ca3db075689 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
@@ -12,7 +12,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -24,7 +24,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -54,7 +54,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -66,7 +66,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -96,7 +96,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -108,7 +108,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -138,7 +138,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -150,7 +150,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -180,7 +180,7 @@ define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -205,7 +205,7 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -230,7 +230,7 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -255,7 +255,7 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -283,7 +283,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX7-NEXT:    ret <3 x i16> [[INS_2]]
@@ -299,7 +299,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
@@ -308,8 +308,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -344,7 +347,7 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
 ; GFX7-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX7-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
@@ -358,9 +361,10 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP1:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP3]])
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[INS_1]], <4 x i16> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -368,10 +372,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
-; GFX9-NEXT:    [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP8:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP7]])
 ; GFX9-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP4]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
index a47cd575cf8e3..88a03bb72c276 100644
--- a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
+++ b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
@@ -9,16 +9,16 @@ target triple = "wasm32-unknown-unknown"
 
 define void @foo(<2 x i64> %x, <4 x i32> %y, ptr %out) #0 {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 2
+; CHECK-NEXT:    [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
 ; CHECK-NEXT:    [[CONV6:%.*]] = zext i32 [[B]] to i64
 ; CHECK-NEXT:    [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
 ; CHECK-NEXT:    store i64 [[C]], ptr [[OUT:%.*]], align 8
-; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i64> [[X]], i64 1
-; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i32> [[Y]], i64 3
+; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
 ; CHECK-NEXT:    [[CONV17:%.*]] = zext i32 [[E]] to i64
 ; CHECK-NEXT:    [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[OUT]], i32 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[OUT]], i32 1
 ; CHECK-NEXT:    store i64 [[F]], ptr [[ARRAYIDX2]], align 8
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
index 42b39bf7f3590..92ffbad73d5f3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -8,14 +8,14 @@
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i64 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i64 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
 ; SSE-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
 ; SSE-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
 ; SSE-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
@@ -24,31 +24,31 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
 ; SSE-NEXT:    [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
 ; SSE-NEXT:    [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; SSE-NEXT:    [[R23:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i64 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i64 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i64 6
-; SSE-NEXT:    [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i64 7
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x float> [[R71]]
 ;
 ; SLM-LABEL: @ceil_floor(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -56,20 +56,20 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SLM-NEXT:    ret <8 x float> [[R71]]
 ;
 ; AVX-LABEL: @ceil_floor(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT:    [[TMP8:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP9]])
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP10]])
+; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
 ; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -77,20 +77,20 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX-NEXT:    ret <8 x float> [[R71]]
 ;
 ; AVX2-LABEL: @ceil_floor(
-; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
index b0bc7647a755b..f504821b5fa67 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -8,14 +8,14 @@
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i64 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i64 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
 ; SSE-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
 ; SSE-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
 ; SSE-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
@@ -24,75 +24,75 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
 ; SSE-NEXT:    [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
 ; SSE-NEXT:    [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; SSE-NEXT:    [[R23:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i64 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i64 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i64 6
-; SSE-NEXT:    [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i64 7
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x float> [[R71]]
 ;
 ; SLM-LABEL: @ceil_floor(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SLM-NEXT:    ret <8 x float> [[R71]]
 ;
 ; AVX-LABEL: @ceil_floor(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT:    [[TMP8:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP9]])
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP10]])
+; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX-NEXT:    ret <8 x float> [[R71]]
 ;
 ; AVX2-LABEL: @ceil_floor(
-; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:    ret <8 x float> [[R71]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
index 2d60078ca1eac..65e5458b25d2f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -12,7 +12,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @sitofp_uitofp(
@@ -20,7 +21,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @sitofp_uitofp(
@@ -74,7 +76,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @fptosi_fptoui(
@@ -82,7 +85,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX-LABEL: @fptosi_fptoui(
@@ -133,36 +137,50 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 define <8 x float> @fneg_fabs(<8 x float> %a) {
 ; SSE2-LABEL: @fneg_fabs(
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:    [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SSE2-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; SSE2-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648)
+; SSE2-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647)
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
+; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
 ; SSE2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; SLM-LABEL: @fneg_fabs(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SLM-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SLM-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648)
+; SLM-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647)
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
+; SLM-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
 ; SLM-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX-LABEL: @fneg_fabs(
-; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX2-LABEL: @fneg_fabs(
-; AVX2-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX2-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX2-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX512-LABEL: @fneg_fabs(
-; AVX512-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX512-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX512-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
@@ -214,7 +232,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @sext_zext(
@@ -222,7 +241,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX-LABEL: @sext_zext(
@@ -275,7 +295,9 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; CHECK-NEXT:    [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    ret <8 x float> [[R71]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
@@ -321,7 +343,7 @@ define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; CHECK-NEXT:    ret <8 x float> [[R71]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index 1afb4298447c3..fad46870ec475 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -12,7 +12,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @sitofp_uitofp(
@@ -20,7 +21,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @sitofp_uitofp(
@@ -74,7 +76,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @fptosi_fptoui(
@@ -82,7 +85,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX-LABEL: @fptosi_fptoui(
@@ -133,36 +137,50 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 define <8 x float> @fneg_fabs(<8 x float> %a) {
 ; SSE2-LABEL: @fneg_fabs(
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:    [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SSE2-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; SSE2-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648)
+; SSE2-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647)
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
+; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
 ; SSE2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; SLM-LABEL: @fneg_fabs(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SLM-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SLM-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648)
+; SLM-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647)
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
+; SLM-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
 ; SLM-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX-LABEL: @fneg_fabs(
-; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX2-LABEL: @fneg_fabs(
-; AVX2-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX2-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX2-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX512-LABEL: @fneg_fabs(
-; AVX512-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX512-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX512-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
@@ -214,7 +232,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @sext_zext(
@@ -222,7 +241,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX-LABEL: @sext_zext(
@@ -275,7 +295,9 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; CHECK-NEXT:    [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    ret <8 x float> [[R71]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
@@ -321,7 +343,7 @@ define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; CHECK-NEXT:    ret <8 x float> [[R71]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
index 79179ed74c9b8..99b13bdc05082 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
@@ -8,27 +8,39 @@
 
 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fadd_fsub_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SSE-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @fadd_fsub_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @fadd_fsub_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX2-LABEL: @fadd_fsub_v8f32(
@@ -80,35 +92,51 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 
 define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SSE-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX2-LABEL: @fmul_fdiv_v8f32(
-; AVX2-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX2-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX2-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX2-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX512-LABEL: @fmul_fdiv_v8f32(
@@ -154,30 +182,39 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
 
 define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
 ; SSE-LABEL: @fmul_fdiv_v4f32_const(
-; SSE-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; SSE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; SSE-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; SSE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v4f32_const(
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.000000e+00, float 1.000000e+00>
-; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
+; SLM-NEXT:    [[AB2:%.*]] = fdiv float [[A2]], 1.000000e+00
+; SLM-NEXT:    [[AB3:%.*]] = fdiv float [[A3]], 5.000000e-01
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
 ; SLM-NEXT:    ret <4 x float> [[R3]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v4f32_const(
-; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; AVX2-LABEL: @fmul_fdiv_v4f32_const(
-; AVX2-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX2-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX2-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX2-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; AVX512-LABEL: @fmul_fdiv_v4f32_const(
-; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX512-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX512-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
index f510b5f978b4f..7f9475917b566 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -8,27 +8,39 @@
 
 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fadd_fsub_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SSE-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @fadd_fsub_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @fadd_fsub_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX2-LABEL: @fadd_fsub_v8f32(
@@ -80,35 +92,51 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 
 define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SSE-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX2-LABEL: @fmul_fdiv_v8f32(
-; AVX2-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX2-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX2-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX2-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX512-LABEL: @fmul_fdiv_v8f32(
@@ -154,30 +182,39 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
 
 define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
 ; SSE-LABEL: @fmul_fdiv_v4f32_const(
-; SSE-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; SSE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; SSE-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; SSE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v4f32_const(
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.000000e+00, float 1.000000e+00>
-; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
+; SLM-NEXT:    [[AB2:%.*]] = fdiv float [[A2]], 1.000000e+00
+; SLM-NEXT:    [[AB3:%.*]] = fdiv float [[A3]], 5.000000e-01
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
 ; SLM-NEXT:    ret <4 x float> [[R3]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v4f32_const(
-; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; AVX2-LABEL: @fmul_fdiv_v4f32_const(
-; AVX2-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX2-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX2-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX2-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; AVX512-LABEL: @fmul_fdiv_v4f32_const(
-; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX512-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX512-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
index daca280d7bc84..11ab7770a5383 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -8,19 +8,25 @@
 
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @add_sub_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @add_sub_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]]
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @add_sub_v8i32(
@@ -130,19 +136,25 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
 
 define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_shl_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]]
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
@@ -204,7 +216,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32_const(
@@ -212,7 +225,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
@@ -262,24 +276,25 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 
 define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
-; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6
-; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @ashr_lshr_shl_v8i32(
@@ -293,7 +308,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SLM-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; SLM-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SLM-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SLM-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX1-LABEL: @ashr_lshr_shl_v8i32(
@@ -307,7 +324,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX1-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX1-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX1-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX1-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX2-LABEL: @ashr_lshr_shl_v8i32(
@@ -321,7 +340,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX2-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX2-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX512-LABEL: @ashr_lshr_shl_v8i32(
@@ -335,7 +356,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX512-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX512-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX512-NEXT:    ret <8 x i32> [[R71]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
@@ -407,100 +430,130 @@ define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) {
 
 define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 ; SSE-LABEL: @sdiv_v8i32_undefs(
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; SSE-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; SSE-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; SSE-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SSE-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; SSE-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; SSE-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; SSE-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @sdiv_v8i32_undefs(
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; SLM-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; SLM-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; SLM-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SLM-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; SLM-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; SLM-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; SLM-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SLM-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX1-LABEL: @sdiv_v8i32_undefs(
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX1-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX1-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; AVX1-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; AVX1-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX1-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX1-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; AVX1-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @sdiv_v8i32_undefs(
-; AVX2-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX2-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX2-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX2-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX2-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX2-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX2-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; AVX2-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], <i32 8, i32 16>
+; AVX2-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX2-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], <i32 8, i32 16>
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
 ; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
+; AVX2-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4
+; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX512-LABEL: @sdiv_v8i32_undefs(
-; AVX512-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX512-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; AVX512-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX512-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX512-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX512-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX512-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX512-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; AVX512-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], <i32 8, i32 16>
+; AVX512-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX512-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT:    [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], <i32 8, i32 16>
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
+; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
 ; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
+; AVX512-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 8, i32 9>
+; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX512-NEXT:    ret <8 x i32> [[R71]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
@@ -533,26 +586,28 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; SSE-LABEL: @add_sub_v8i32_splat(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SSE-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @add_sub_v8i32_splat(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SLM-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX1-LABEL: @add_sub_v8i32_splat(
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX1-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX1-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
@@ -560,7 +615,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; AVX1-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX2-LABEL: @add_sub_v8i32_splat(
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX2-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
@@ -568,7 +623,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; AVX2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX512-LABEL: @add_sub_v8i32_splat(
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX512-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index 6ba4330272329..9589ec24d49d4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -8,19 +8,25 @@
 
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @add_sub_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @add_sub_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]]
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @add_sub_v8i32(
@@ -130,19 +136,25 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
 
 define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_shl_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]]
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
@@ -204,7 +216,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32_const(
@@ -212,7 +225,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
@@ -262,24 +276,25 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 
 define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
-; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6
-; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @ashr_lshr_shl_v8i32(
@@ -293,7 +308,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SLM-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; SLM-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SLM-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SLM-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX1-LABEL: @ashr_lshr_shl_v8i32(
@@ -307,7 +324,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX1-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX1-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX1-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX1-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX2-LABEL: @ashr_lshr_shl_v8i32(
@@ -321,7 +340,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX2-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX2-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX512-LABEL: @ashr_lshr_shl_v8i32(
@@ -335,7 +356,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX512-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX512-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX512-NEXT:    ret <8 x i32> [[R71]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
@@ -407,100 +430,130 @@ define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) {
 
 define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 ; SSE-LABEL: @sdiv_v8i32_undefs(
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; SSE-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; SSE-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; SSE-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SSE-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; SSE-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; SSE-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; SSE-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @sdiv_v8i32_undefs(
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; SLM-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; SLM-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; SLM-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SLM-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; SLM-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; SLM-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; SLM-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SLM-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX1-LABEL: @sdiv_v8i32_undefs(
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX1-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX1-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; AVX1-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; AVX1-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX1-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX1-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; AVX1-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @sdiv_v8i32_undefs(
-; AVX2-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX2-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX2-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX2-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX2-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX2-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX2-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; AVX2-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], <i32 8, i32 16>
+; AVX2-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX2-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], <i32 8, i32 16>
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
 ; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 4, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
+; AVX2-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4
+; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX512-LABEL: @sdiv_v8i32_undefs(
-; AVX512-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX512-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; AVX512-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX512-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX512-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX512-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX512-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX512-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; AVX512-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], <i32 8, i32 16>
+; AVX512-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX512-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT:    [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], <i32 8, i32 16>
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
+; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
 ; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 4, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
+; AVX512-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX512-NEXT:    ret <8 x i32> [[R71]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
@@ -533,26 +586,28 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; SSE-LABEL: @add_sub_v8i32_splat(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SSE-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @add_sub_v8i32_splat(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SLM-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX1-LABEL: @add_sub_v8i32_splat(
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX1-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX1-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
@@ -560,7 +615,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; AVX1-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX2-LABEL: @add_sub_v8i32_splat(
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX2-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
@@ -568,7 +623,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; AVX2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX512-LABEL: @add_sub_v8i32_splat(
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX512-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
index 8fc2576d18008..c2bd9e648e4fd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -86,8 +86,8 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
@@ -107,14 +107,14 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 
 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
@@ -136,14 +136,14 @@ define i8 @k_bb(<4 x i8> %x) {
 ; CHECK-LABEL: @k_bb(
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index 2eb45fe61df09..e4e45eb99fbdf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -39,8 +39,14 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
 
 define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @h_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> <i8 undef, i8 poison, i8 poison, i8 poison>, <4 x i32> <i32 4, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 5, i32 6>
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> undef, i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[X0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[X3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[Y1]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[Y2]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[TMP2]], [[TMP2]]
 ; CHECK-NEXT:    ret <4 x i8> [[TMP3]]
 ;
@@ -87,8 +93,8 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
@@ -108,14 +114,14 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 
 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
@@ -137,14 +143,14 @@ define i8 @k_bb(<4 x i8> %x) {
 ; CHECK-LABEL: @k_bb(
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
index 08fd31cc32a5f..815ad044ca077 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -169,21 +169,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
@@ -214,14 +214,14 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; PR50392
 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @test_v4f64_partial_swizzle(
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; CHECK-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
+; CHECK-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i32 3
 ; CHECK-NEXT:    ret <4 x double> [[R03]]
 ;
   %a0 = extractelement <4 x double> %a, i64 0
@@ -243,21 +243,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -305,21 +305,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-LABEL: @test_v4i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4i64(
@@ -351,21 +351,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @test_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8i32(
@@ -413,21 +413,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SLM-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index 19435f2918b9f..fb037b87da546 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -169,21 +169,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
@@ -214,15 +214,15 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; PR50392
 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64_partial_swizzle(
-; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SSE-NEXT:    [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1
-; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R0212]], double [[R3]], i64 3
+; SSE-NEXT:    [[R021:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R021]], double [[R3]], i32 3
 ; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64_partial_swizzle(
@@ -232,21 +232,21 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double poison, double poison>, double [[R0]], i64 0
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> zeroinitializer, double [[R0]], i32 0
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SLM-NEXT:    ret <4 x double> [[R031]]
 ;
 ; AVX-LABEL: @test_v4f64_partial_swizzle(
-; AVX-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; AVX-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; AVX-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2
+; AVX-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
 ; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; AVX-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 ; AVX-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; AVX-NEXT:    [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1
-; AVX-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R0212]], double [[R3]], i64 3
+; AVX-NEXT:    [[R021:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+; AVX-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R021]], double [[R3]], i32 3
 ; AVX-NEXT:    ret <4 x double> [[R03]]
 ;
   %a0 = extractelement <4 x double> %a, i64 0
@@ -268,21 +268,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -330,21 +330,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-LABEL: @test_v4i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4i64(
@@ -376,21 +376,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @test_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8i32(
@@ -438,21 +438,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SLM-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
index 33de35dbcb6c9..6f655b3e9ae0b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
@@ -148,21 +148,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
@@ -194,21 +194,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -256,21 +256,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-LABEL: @test_v4i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4i64(
@@ -302,21 +302,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @test_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8i32(
@@ -364,21 +364,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index 8300b51cfbf95..2ab81d9b8d308 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -148,21 +148,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
@@ -194,21 +194,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -256,21 +256,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-LABEL: @test_v4i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4i64(
@@ -302,21 +302,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @test_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8i32(
@@ -364,21 +364,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index 69cd8cab2df14..58fb5f772207d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -17,15 +17,15 @@ target triple = "x86_64-unknown-linux-gnu"
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; SSE-LABEL: @PR31243_zext(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
 ; SSE-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1)
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; SSE-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; SSE-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 [[TMP6]]
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]]
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]]
 ; SSE-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; SSE-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; SSE-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -33,15 +33,15 @@ define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ;
 ; AVX-LABEL: @PR31243_zext(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
 ; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1)
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; AVX-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; AVX-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 [[TMP6]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]]
 ; AVX-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; AVX-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; AVX-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -76,15 +76,15 @@ entry:
 define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; SSE-LABEL: @PR31243_sext(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
 ; SSE-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1)
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; SSE-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
-; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; SSE-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i32
+; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]]
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i32
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]]
 ; SSE-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; SSE-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; SSE-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -92,15 +92,15 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ;
 ; AVX-LABEL: @PR31243_sext(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
 ; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1)
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; AVX-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
-; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; AVX-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i32
+; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i32
+; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]]
 ; AVX-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; AVX-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; AVX-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index 4598112dce799..de72521345435 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -10,16 +10,16 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) {
 ; CHECK-LABEL: @shuffle_operands1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; SSE2-LABEL: @shuffle_operands1(
 ; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
-; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i32 0
+; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i32 1
 ; SSE2-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    ret void
@@ -43,7 +43,7 @@ define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -57,7 +57,7 @@ define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -91,7 +91,7 @@ define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -105,7 +105,7 @@ define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -139,7 +139,7 @@ define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -153,7 +153,7 @@ define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -185,12 +185,12 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
+; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, ptr [[FROM:%.*]], i64 1
 ; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
 ; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
@@ -205,7 +205,7 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -239,7 +239,7 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -253,7 +253,7 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -286,13 +286,13 @@ define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
+; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, ptr [[FROM:%.*]], i64 1
 ; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
 ; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -306,7 +306,7 @@ define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -347,23 +347,20 @@ define void @good_load_order() {
 ; CHECK:       for.body3:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ]
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
-; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP12]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]]
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4
 ; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]]
 ; CHECK-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
@@ -382,23 +379,20 @@ define void @good_load_order() {
 ; SSE2:       for.body3:
 ; SSE2-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ]
 ; SSE2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
-; SSE2-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; SSE2-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
-; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
-; SSE2-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; SSE2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
-; SSE2-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; SSE2-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
-; SSE2-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
+; SSE2-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 1
+; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP2]]
+; SSE2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; SSE2-NEXT:    [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], 4
+; SSE2-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]]
 ; SSE2-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
 ; SSE2-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
 ; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
-; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0
+; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
+; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP12]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
 ; SSE2-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
 ; SSE2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; SSE2-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]]
+; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]]
 ; SSE2-NEXT:    [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4
 ; SSE2-NEXT:    [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]]
 ; SSE2-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
index a444db1a55ae7..6fd2de8ad8ab5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
@@ -4,10 +4,10 @@
 define void @test1(float %a, float %b, float %c, float %d, ptr nocapture %p) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    ret void
@@ -30,10 +30,10 @@ entry:
 define void @test1_vec(float %a, float %b, float %c, float %d, ptr nocapture %p) {
 ; CHECK-LABEL: @test1_vec(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 16, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
@@ -54,10 +54,10 @@ entry:
 define void @test2(i32 %a, i32 %b, i32 %c, i32 %d, ptr nocapture %p) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 1)
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
@@ -79,10 +79,10 @@ entry:
 
 define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, ptr nocapture %4) {
 ; CHECK-LABEL: @test2_vec(
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i64 2
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], splat (i32 1)
 ; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[TMP4:%.*]], align 16, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 362ce9e460c24..9e4f10ec7b349 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -8,12 +8,13 @@
 define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i32(
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP4]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], splat (i32 15)
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP8]], <4 x i32> splat (i32 255))
-; CHECK-NEXT:    store <4 x i32> [[TMP9]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <4 x i32> [[TMP8]], splat (i32 255)
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> splat (i32 255)
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
   %4 = load i32, ptr %0, align 4, !tbaa !2
@@ -50,13 +51,14 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i8(
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[TMP0:%.*]], align 1, !tbaa [[TBAA4:![0-9]+]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], splat (i32 15)
-; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP9]], <4 x i32> splat (i32 255))
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc nuw <4 x i32> [[TMP10]] to <4 x i8>
-; CHECK-NEXT:    store <4 x i8> [[TMP11]], ptr [[TMP0]], align 1, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], splat (i32 255)
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP9]], <4 x i32> splat (i32 255)
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i8>
+; CHECK-NEXT:    store <4 x i8> [[TMP12]], ptr [[TMP0]], align 1, !tbaa [[TBAA4]]
 ; CHECK-NEXT:    ret void
 ;
   %4 = load i8, ptr %0, align 1, !tbaa !6
@@ -101,7 +103,7 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i64(
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
index 406d2e660dc9f..1b11c3dcc081c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -13,14 +13,14 @@ define void @foo() {
 ; SSE-LABEL: @foo(
 ; SSE-NEXT:    [[TMP1:%.*]] = load i32, ptr @b, align 16
 ; SSE-NEXT:    store i32 [[TMP1]], ptr @a, align 16
-; SSE-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 4), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 8), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 12), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 16), align 16
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 20), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 24), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 28), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4
+; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4
+; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4
+; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @foo(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index 9273333f1d6b4..db38a62017391 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -7,83 +7,98 @@
 
 define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; SSE-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; SSE-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; SSE-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX512F-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX512F-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512F-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX512F-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX512VL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX512VL-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX512VL-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512VL-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512VL-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512VL-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512VL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512VL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX512VL-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX512VL-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX512VL-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
   %3 = getelementptr inbounds i32, ptr %1, i64 1
@@ -107,64 +122,64 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
 
 define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_2(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
 ; SSE-NEXT:    store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; SSE-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
 ; SSE-NEXT:    store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; SSE-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
 ; SSE-NEXT:    store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_2(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
 ; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_2(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
 ; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_2(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512F-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
 ; AVX512F-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -172,8 +187,8 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_2(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512VL-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
 ; AVX512VL-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -207,39 +222,39 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; SSE-LABEL: @gather_load_3(
 ; SSE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
 ; SSE-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
 ; SSE-NEXT:    store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
 ; SSE-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
+; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
 ; SSE-NEXT:    store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
 ; SSE-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
 ; SSE-NEXT:    store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
+; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
 ; SSE-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 20
+; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5
 ; SSE-NEXT:    store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
+; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
 ; SSE-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 24
+; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 6
 ; SSE-NEXT:    store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
 ; SSE-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 28
+; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7
 ; SSE-NEXT:    store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
 ; SSE-NEXT:    [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
 ; SSE-NEXT:    store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]]
@@ -247,28 +262,28 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ;
 ; AVX-LABEL: @gather_load_3(
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
 ; AVX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
 ; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
 ; AVX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
 ; AVX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
 ; AVX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
 ; AVX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
-; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
-; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
-; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
-; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
-; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
-; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
-; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4
+; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6
+; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7
 ; AVX-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
@@ -340,20 +355,20 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 
 define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) {
 ; SSE-LABEL: @gather_load_4(
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[T0:%.*]], i64 4
-; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 8
-; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 12
-; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 16
-; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 20
-; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 24
-; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 28
-; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1
+; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
+; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2
+; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
+; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3
+; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
+; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 4
+; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
+; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 5
+; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
+; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 6
+; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
+; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7
+; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
 ; SSE-NEXT:    [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -381,13 +396,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
+; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
+; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
+; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
+; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
+; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
+; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
+; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
 ; AVX-NEXT:    [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -396,14 +411,14 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; AVX-NEXT:    [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7
 ; AVX-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
@@ -481,153 +496,154 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
 ; SSE-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; SSE-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 16
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4
 ; SSE-NEXT:    [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
 ; SSE-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
 ; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
-; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i32 3
 ; SSE-NEXT:    [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]]
 ; SSE-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; SSE-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; SSE-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; SSE-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; SSE-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; SSE-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; SSE-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; SSE-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; SSE-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0
-; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1
-; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2
-; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3
-; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0
-; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1
-; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2
-; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3
+; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i32 0
+; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i32 1
+; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i32 2
+; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i32 3
+; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i32 0
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i32 1
+; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i32 2
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i32 3
 ; SSE-NEXT:    [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]]
 ; SSE-NEXT:    store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
 ; AVX-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; AVX-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; AVX-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; AVX-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; AVX-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; AVX-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
 ; AVX-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
-; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
+; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
 ; AVX-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
-; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
-; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
-; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
-; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7
 ; AVX-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
 ; AVX-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
 ; AVX2-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; AVX2-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX2-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; AVX2-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; AVX2-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX2-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; AVX2-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX2-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; AVX2-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX2-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
 ; AVX2-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
-; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
+; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
 ; AVX2-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
-; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
-; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
-; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
-; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7
 ; AVX2-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
 ; AVX2-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_div(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512F-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
@@ -637,6 +653,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ;
 ; AVX512VL-LABEL: @gather_load_div(
 ; AVX512VL-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index bd6dbdeafffc3..bfa3610804967 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -7,83 +7,98 @@
 
 define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; SSE-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; SSE-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; SSE-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX512F-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512F-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512F-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX512F-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX512VL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX512VL-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX512VL-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512VL-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512VL-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512VL-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512VL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512VL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX512VL-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512VL-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX512VL-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
   %3 = getelementptr inbounds i32, ptr %1, i64 1
@@ -107,64 +122,64 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
 
 define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_2(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
 ; SSE-NEXT:    store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; SSE-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
 ; SSE-NEXT:    store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; SSE-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
 ; SSE-NEXT:    store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_2(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
 ; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_2(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
 ; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_2(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512F-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
 ; AVX512F-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -172,8 +187,8 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_2(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512VL-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
 ; AVX512VL-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -207,39 +222,39 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; SSE-LABEL: @gather_load_3(
 ; SSE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
 ; SSE-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
 ; SSE-NEXT:    store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
 ; SSE-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
+; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
 ; SSE-NEXT:    store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
 ; SSE-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
 ; SSE-NEXT:    store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
+; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
 ; SSE-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 20
+; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5
 ; SSE-NEXT:    store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
+; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
 ; SSE-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 24
+; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 6
 ; SSE-NEXT:    store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
 ; SSE-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 28
+; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7
 ; SSE-NEXT:    store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
 ; SSE-NEXT:    [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
 ; SSE-NEXT:    store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]]
@@ -247,28 +262,28 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ;
 ; AVX-LABEL: @gather_load_3(
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
 ; AVX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
 ; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
 ; AVX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
 ; AVX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
 ; AVX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
 ; AVX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
-; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
-; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
-; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
-; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
-; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
-; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
-; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4
+; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6
+; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7
 ; AVX-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
@@ -340,20 +355,20 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 
 define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) {
 ; SSE-LABEL: @gather_load_4(
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[T0:%.*]], i64 4
-; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 8
-; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 12
-; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 16
-; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 20
-; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 24
-; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 28
-; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1
+; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
+; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2
+; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
+; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3
+; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
+; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 4
+; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
+; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 5
+; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
+; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 6
+; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
+; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7
+; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
 ; SSE-NEXT:    [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -381,13 +396,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
+; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
+; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
+; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
+; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
+; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
+; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
+; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
 ; AVX-NEXT:    [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -396,14 +411,14 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; AVX-NEXT:    [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7
 ; AVX-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
@@ -481,153 +496,154 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
 ; SSE-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; SSE-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 16
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4
 ; SSE-NEXT:    [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
 ; SSE-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
 ; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
-; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i32 3
 ; SSE-NEXT:    [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]]
 ; SSE-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; SSE-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; SSE-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; SSE-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; SSE-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; SSE-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; SSE-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; SSE-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; SSE-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0
-; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1
-; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2
-; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3
-; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0
-; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1
-; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2
-; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3
+; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i32 0
+; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i32 1
+; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i32 2
+; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i32 3
+; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i32 0
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i32 1
+; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i32 2
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i32 3
 ; SSE-NEXT:    [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]]
 ; SSE-NEXT:    store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
 ; AVX-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; AVX-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; AVX-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; AVX-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; AVX-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; AVX-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
 ; AVX-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
-; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
+; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
 ; AVX-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
-; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
-; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
-; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
-; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7
 ; AVX-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
 ; AVX-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
 ; AVX2-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; AVX2-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX2-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; AVX2-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; AVX2-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX2-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; AVX2-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX2-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; AVX2-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX2-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
 ; AVX2-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
-; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
+; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
 ; AVX2-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
-; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
-; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
-; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
-; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7
 ; AVX2-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
 ; AVX2-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_div(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512F-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
@@ -637,6 +653,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ;
 ; AVX512VL-LABEL: @gather_load_div(
 ; AVX512VL-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
index b4b6f43d77785..a4949bc67b0f1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -6,10 +6,10 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i64 0
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i32 0
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[F]], 1
-; CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i64 0
+; CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -28,7 +28,7 @@ define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
 
 define <4 x i32> @bar(<4 x i32> %x, i32 %f) {
 ; CHECK-LABEL: @bar(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
index 4623db5ce7060..bada001ebbc6c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
@@ -5,8 +5,9 @@
 
 define dso_local <4 x float> @foo(<4 x i32> %0) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0:%.*]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0]] to <4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = extractelement <4 x i32> %0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
index c38ca3f7ee24b..cec99c694391b 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
@@ -19,8 +19,14 @@ define void @test1(ptr %in, ptr %out) {
 ; COMBINE-LABEL: @test1(
 ; COMBINE-NEXT:  entry:
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
-; COMBINE-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64>
-; COMBINE-NEXT:    store <8 x i64> [[TMP1]], ptr [[OUT:%.*]], align 8
+; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0
+; COMBINE-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0)
+; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
+; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    store <8 x i64> [[TMP1]], ptr [[OUT]], align 8
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -61,9 +67,14 @@ define void @test2(ptr %in, ptr %out) {
 ; COMBINE-LABEL: @test2(
 ; COMBINE-NEXT:  entry:
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
-; COMBINE-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64>
+; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0
+; COMBINE-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0)
+; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
+; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
-; COMBINE-NEXT:    store <8 x i64> [[TMP2]], ptr [[OUT:%.*]], align 8
+; COMBINE-NEXT:    store <8 x i64> [[TMP2]], ptr [[OUT]], align 8
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -99,8 +110,11 @@ define void @test3(<16 x i32> %0, ptr %out) {
 ;
 ; COMBINE-LABEL: @test3(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; COMBINE-NEXT:    [[TMP3:%.*]] = call <64 x i32> @llvm.vector.insert.v64i32.v16i32(<64 x i32> poison, <16 x i32> [[TMP0:%.*]], i64 0)
+; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0
+; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT]], align 4
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -134,8 +148,12 @@ define void @test4(ptr %in, ptr %out) {
 ; COMBINE-LABEL: @test4(
 ; COMBINE-NEXT:  entry:
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4
-; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0
+; COMBINE-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0)
+; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT]], align 4
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -165,7 +183,11 @@ define void @test5(ptr %out) {
 ;
 ; COMBINE-LABEL: @test5(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    store <8 x i32> zeroinitializer, ptr [[OUT:%.*]], align 4
+; COMBINE-NEXT:    [[TMP0:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> zeroinitializer, i64 0)
+; COMBINE-NEXT:    [[TMP1:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP0]], <8 x i32> zeroinitializer, i64 8)
+; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 0
+; COMBINE-NEXT:    store <8 x i32> [[TMP2]], ptr [[TMP3]], align 4
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -219,28 +241,37 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) {
 ;
 ; COMBINE-LABEL: @test6(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr [[IN0:%.*]], i64 32
+; COMBINE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[IN0:%.*]], i64 32
 ; COMBINE-NEXT:    [[LOAD2:%.*]] = load <4 x float>, ptr [[GEP1]], align 16
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[IN0]], align 16
 ; COMBINE-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[IN1:%.*]], align 1
-; COMBINE-NEXT:    [[TMP2:%.*]] = uitofp <32 x i8> [[TMP1]] to <32 x float>
+; COMBINE-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP11:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP4:%.*]] = zext <32 x i8> [[TMP11]] to <32 x i16>
+; COMBINE-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP2:%.*]] = uitofp <32 x i16> [[TMP19]] to <32 x float>
 ; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; COMBINE-NEXT:    [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP3]], <4 x float> [[LOAD2]], i64 8)
 ; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]]
-; COMBINE-NEXT:    [[GEP10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN1]], i64 32
-; COMBINE-NEXT:    [[GEP11:%.*]] = getelementptr inbounds nuw i8, ptr [[IN2:%.*]], i64 128
+; COMBINE-NEXT:    [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32
+; COMBINE-NEXT:    [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2:%.*]], i64 128
 ; COMBINE-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16
 ; COMBINE-NEXT:    store <32 x float> [[TMP7]], ptr [[IN2]], align 16
 ; COMBINE-NEXT:    [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1
-; COMBINE-NEXT:    [[TMP9:%.*]] = uitofp <16 x i8> [[LOAD5]] to <16 x float>
-; COMBINE-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; COMBINE-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP15:%.*]] = shufflevector <16 x float> [[TMP12]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; COMBINE-NEXT:    [[TMP13:%.*]] = call <32 x i8> @llvm.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> [[LOAD5]], i64 0)
+; COMBINE-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
+; COMBINE-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP9:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float>
+; COMBINE-NEXT:    [[TMP20:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> [[LOAD2]], i64 0)
+; COMBINE-NEXT:    [[TMP21:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP8]], i64 0)
+; COMBINE-NEXT:    [[TMP22:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP20]], <4 x float> [[TMP21]], i64 4)
+; COMBINE-NEXT:    [[TMP23:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP8]], i64 4)
+; COMBINE-NEXT:    [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP22]], <4 x float> [[TMP23]], i64 8)
 ; COMBINE-NEXT:    [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
 ; COMBINE-NEXT:    [[TMP17:%.*]] = fmul <16 x float> [[TMP16]], [[TMP9]]
 ; COMBINE-NEXT:    store <16 x float> [[TMP17]], ptr [[GEP11]], align 16
@@ -353,7 +384,21 @@ define i32 @test7() {
 ;
 ; COMBINE-LABEL: @test7(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    store <16 x float> poison, ptr null, align 16
+; COMBINE-NEXT:    [[TMP0:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> zeroinitializer, i64 0)
+; COMBINE-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> zeroinitializer, i64 8)
+; COMBINE-NEXT:    [[TMP2:%.*]] = fsub <16 x float> [[TMP1]], [[TMP1]]
+; COMBINE-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[TMP1]], [[TMP1]]
+; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 0)
+; COMBINE-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> zeroinitializer, i64 4)
+; COMBINE-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP7]], <4 x float> zeroinitializer, i64 8)
+; COMBINE-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP8]], <4 x float> zeroinitializer, i64 12)
+; COMBINE-NEXT:    [[TMP10:%.*]] = fadd <16 x float> [[TMP9]], [[TMP5]]
+; COMBINE-NEXT:    [[TMP11:%.*]] = fsub <16 x float> [[TMP9]], [[TMP5]]
+; COMBINE-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP13:%.*]] = fadd <16 x float> [[TMP9]], [[TMP12]]
+; COMBINE-NEXT:    store <16 x float> [[TMP13]], ptr null, align 16
 ; COMBINE-NEXT:    ret i32 0
 ;
 entry:

>From 69d9c3cc8408d2edb760f0fd18c1cc18808a5f55 Mon Sep 17 00:00:00 2001
From: ericxu233 <xuhanyang5 at gmail.com>
Date: Sun, 29 Jun 2025 15:50:16 -0400
Subject: [PATCH 3/3] Remove dce as well for dce,instcombine instances and
 added a note for removing dead code

---
 llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll  | 4 ++--
 llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll | 4 ++--
 llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll      | 2 ++
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index 0a8ad26c900f8..89e133bb1c6a1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 ; These tests check that we remove from consideration pairs of seed
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
index 86d670551c10b..ac476c521a591 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index 04bd969da6304..6c5220d13b7a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -420,6 +420,7 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
+; TODO: Dead code must be removed below.
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
@@ -1015,6 +1016,7 @@ define void @store_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
+; TODO: Dead code must be removed below.
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]