[llvm-branch-commits] [llvm] DAG: Handle load in SimplifyDemandedVectorElts (PR #122671)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Tue Feb 4 01:32:57 PST 2025


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/122671

From b745947e5ec3def08a4d9d99607a2f39a342ff89 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 13 Jan 2025 11:22:55 +0700
Subject: [PATCH] DAG: Handle load in SimplifyDemandedVectorElts

This improves some AMDGPU cases and avoids future regressions.
The combiner likes to form shuffles for cases where an extract_vector_elt
would do perfectly well, and this recovers some of the regressions from
losing load narrowing.

AMDGPU, AArch64 and RISCV test changes look broadly better. Other targets
have some improvements but mostly regressions; in particular, X86 looks much
worse. I'm guessing this is because its shouldReduceLoadWidth hook is wrong.

I mostly just regenerated the checks. I assume some of them should
switch to volatile loads to defeat the optimization.
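
For reference, a minimal IR sketch of the pattern this targets (a
hypothetical example, not taken from the test suite):

    define double @one_lane(ptr %p) {
      ; Whole-vector load, but only lane 3 is ever demanded.
      %v = load <4 x double>, ptr %p
      %e = extractelement <4 x double> %v, i64 3
      ret double %e
    }

Only lane 3 of the load is demanded, so SimplifyDemandedVectorElts can
now rewrite the full vector load as a scalar load of that element (via
scalarizeExtractedVectorLoad), re-inserted into an undef vector so any
remaining vector-typed users stay valid. The
sve-fixed-length-extract-vector-elt.ll changes below show the effect:
the ptrue/ld1d/lastb sequences collapse to a single
"ldr d0, [x0, #24]"-style scalar load.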
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   31 +
 .../AArch64/arm64-big-endian-bitconverts.ll   |   54 +-
 .../AArch64/dag-ReplaceAllUsesOfValuesWith.ll |    5 +-
 llvm/test/CodeGen/AArch64/fcmp.ll             |   43 +-
 llvm/test/CodeGen/AArch64/fmlal-loreg.ll      |    8 +-
 llvm/test/CodeGen/AArch64/icmp.ll             |   17 +-
 .../sve-fixed-length-extract-vector-elt.ll    |  114 +-
 .../AArch64/sve-fixed-length-masked-gather.ll |    3 +-
 .../sve-fixed-length-masked-scatter.ll        |   10 +-
 ...ng-mode-fixed-length-extract-vector-elt.ll |   30 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll     |   39 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll     |   38 +-
 .../AMDGPU/greedy-reverse-local-assignment.ll |   21 +-
 .../identical-subrange-spill-infloop.ll       |    6 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |   24 +-
 .../AMDGPU/shader-addr64-nonuniform.ll        |    8 +-
 llvm/test/CodeGen/AMDGPU/trunc.ll             |    4 +-
 .../test/CodeGen/AMDGPU/vector_rebroadcast.ll | 2657 +++++++++--------
 .../CodeGen/AMDGPU/vector_shuffle.packed.ll   |  250 +-
 .../ARM/crash-on-pow2-shufflevector.ll        |    5 +-
 llvm/test/CodeGen/ARM/vector-promotion.ll     |   30 +-
 llvm/test/CodeGen/ARM/vext.ll                 |    7 +-
 llvm/test/CodeGen/ARM/vuzp.ll                 |   14 +-
 llvm/test/CodeGen/Mips/cconv/vector.ll        |   65 +-
 .../test/CodeGen/Mips/msa/basic_operations.ll |   96 +-
 llvm/test/CodeGen/NVPTX/i128.ll               |   10 +-
 llvm/test/CodeGen/NVPTX/i8x4-instructions.ll  |   48 +-
 .../PowerPC/aix-vector-byval-callee.ll        |    4 +-
 .../PowerPC/canonical-merge-shuffles.ll       |    9 +-
 llvm/test/CodeGen/PowerPC/const-stov.ll       |   15 +-
 llvm/test/CodeGen/PowerPC/pr27078.ll          |   22 +-
 llvm/test/CodeGen/PowerPC/pre-inc-disable.ll  |   28 +-
 .../PowerPC/v16i8_scalar_to_vector_shuffle.ll |  120 +-
 .../PowerPC/v2i64_scalar_to_vector_shuffle.ll |  212 +-
 .../PowerPC/v8i16_scalar_to_vector_shuffle.ll |    6 +-
 llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll   |  104 +-
 .../RISCV/rvv/fixed-vectors-fp-shuffles.ll    |   27 +-
 llvm/test/CodeGen/Thumb2/mve-extractstore.ll  |   24 +-
 .../CodeGen/Thumb2/mve-insertshuffleload.ll   |   48 +-
 llvm/test/CodeGen/X86/SwizzleShuff.ll         |    2 +-
 llvm/test/CodeGen/X86/avx-vbroadcast.ll       |    6 +-
 llvm/test/CodeGen/X86/avx.ll                  |    6 +-
 .../CodeGen/X86/avx1-logical-load-folding.ll  |   28 +-
 llvm/test/CodeGen/X86/avx512-arith.ll         |    4 +-
 .../CodeGen/X86/avx512-broadcast-arith.ll     |   12 +-
 llvm/test/CodeGen/X86/avx512-calling-conv.ll  |    4 +-
 llvm/test/CodeGen/X86/avx512-cmp.ll           |    2 +-
 llvm/test/CodeGen/X86/avx512-ext.ll           |   18 +-
 .../avx512-extract-subvector-load-store.ll    |   12 +-
 .../X86/avx512-intrinsics-fast-isel.ll        |   25 +-
 llvm/test/CodeGen/X86/avx512-load-store.ll    |   32 +-
 llvm/test/CodeGen/X86/avx512-logic.ll         |   14 +-
 llvm/test/CodeGen/X86/avx512-select.ll        |    4 +-
 .../X86/avx512-shuffles/partial_permute.ll    |  802 ++---
 .../X86/avx512-shuffles/shuffle-interleave.ll |   31 +-
 .../CodeGen/X86/avx512-shuffles/unpack.ll     |   40 +-
 llvm/test/CodeGen/X86/avx512fp16-mov.ll       |    8 +-
 llvm/test/CodeGen/X86/bitreverse.ll           |   10 +-
 llvm/test/CodeGen/X86/buildvec-insertvec.ll   |   16 +-
 llvm/test/CodeGen/X86/combine-fabs.ll         |    9 +-
 llvm/test/CodeGen/X86/combine-sdiv.ll         |   12 +-
 llvm/test/CodeGen/X86/combine-udiv.ll         |    8 +-
 llvm/test/CodeGen/X86/commute-blend-avx2.ll   |    6 +-
 llvm/test/CodeGen/X86/commute-blend-sse41.ll  |   12 +-
 .../X86/copysign-constant-magnitude.ll        |   24 +-
 llvm/test/CodeGen/X86/extract-concat.ll       |   10 +-
 llvm/test/CodeGen/X86/extractelement-fp.ll    |    7 +-
 llvm/test/CodeGen/X86/extractelement-load.ll  |   59 +-
 llvm/test/CodeGen/X86/fabs.ll                 |    3 +-
 llvm/test/CodeGen/X86/fast-isel-fneg.ll       |    5 +-
 llvm/test/CodeGen/X86/fma-signed-zero.ll      |    6 +-
 llvm/test/CodeGen/X86/fp-fold.ll              |   27 +-
 llvm/test/CodeGen/X86/fp-intrinsics-fma.ll    |   42 +-
 llvm/test/CodeGen/X86/fp-logic.ll             |   12 +-
 llvm/test/CodeGen/X86/fp-round.ll             |   57 +-
 llvm/test/CodeGen/X86/fp128-cast.ll           |    3 +-
 llvm/test/CodeGen/X86/fp16-libcalls.ll        |   12 +-
 llvm/test/CodeGen/X86/freeze-vector.ll        |    8 +-
 llvm/test/CodeGen/X86/gfni-funnel-shifts.ll   |   50 +-
 llvm/test/CodeGen/X86/half.ll                 |   14 +-
 .../X86/insert-into-constant-vector.ll        |   52 +-
 llvm/test/CodeGen/X86/insertps-combine.ll     |   12 +-
 .../CodeGen/X86/insertps-from-constantpool.ll |    6 +-
 .../CodeGen/X86/insertps-unfold-load-bug.ll   |    4 +-
 llvm/test/CodeGen/X86/is_fpclass.ll           |   10 +-
 .../X86/isel-blendi-gettargetconstant.ll      |    7 +-
 llvm/test/CodeGen/X86/load-partial.ll         |    4 -
 llvm/test/CodeGen/X86/masked_load.ll          |    3 +-
 llvm/test/CodeGen/X86/masked_store.ll         |   15 +-
 llvm/test/CodeGen/X86/mmx-arith.ll            |   11 +-
 llvm/test/CodeGen/X86/neg_fp.ll               |    5 +-
 llvm/test/CodeGen/X86/negative-sin.ll         |    3 +-
 llvm/test/CodeGen/X86/packus.ll               |   60 +-
 llvm/test/CodeGen/X86/peephole-fold-movsd.ll  |    2 +-
 llvm/test/CodeGen/X86/pr14161.ll              |    3 +-
 llvm/test/CodeGen/X86/pr30511.ll              |    5 +-
 llvm/test/CodeGen/X86/pr31956.ll              |    9 +-
 llvm/test/CodeGen/X86/pr34592.ll              |   36 +-
 llvm/test/CodeGen/X86/pr36553.ll              |    3 +-
 llvm/test/CodeGen/X86/pr40811.ll              |    9 +-
 llvm/test/CodeGen/X86/pr63091.ll              |    7 +-
 llvm/test/CodeGen/X86/sar_fold64.ll           |   16 +-
 llvm/test/CodeGen/X86/setcc-combine.ll        |    6 +-
 .../test/CodeGen/X86/setcc-non-simple-type.ll |   26 +-
 llvm/test/CodeGen/X86/shrink_vmul.ll          |   28 +-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll |   24 +-
 llvm/test/CodeGen/X86/splat-for-size.ll       |    4 +-
 llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll   |    3 +-
 .../CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll |    6 +-
 llvm/test/CodeGen/X86/sqrt-fastmath.ll        |   10 +-
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    |  138 +-
 llvm/test/CodeGen/X86/sse-align-12.ll         |    4 +-
 llvm/test/CodeGen/X86/sse2.ll                 |   85 +-
 llvm/test/CodeGen/X86/sse3.ll                 |   26 +-
 llvm/test/CodeGen/X86/sse41.ll                |  184 +-
 llvm/test/CodeGen/X86/strict-fsub-combines.ll |   28 +-
 llvm/test/CodeGen/X86/subvector-broadcast.ll  |    4 +-
 llvm/test/CodeGen/X86/test-shrink-bug.ll      |    7 +-
 .../X86/tuning-shuffle-unpckpd-avx512.ll      |  189 +-
 .../CodeGen/X86/tuning-shuffle-unpckpd.ll     |   52 +-
 .../X86/urem-seteq-vec-tautological.ll        |    6 +-
 llvm/test/CodeGen/X86/vec_insert-5.ll         |   59 +-
 llvm/test/CodeGen/X86/vec_int_to_fp.ll        |  120 +-
 llvm/test/CodeGen/X86/vec_shift5.ll           |   28 +-
 llvm/test/CodeGen/X86/vector-bitreverse.ll    |    5 +-
 .../vector-constrained-fp-intrinsics-flags.ll |   28 +-
 .../X86/vector-constrained-fp-intrinsics.ll   |   21 +-
 llvm/test/CodeGen/X86/vector-fshl-256.ll      |   44 +-
 llvm/test/CodeGen/X86/vector-fshl-512.ll      |   29 +-
 llvm/test/CodeGen/X86/vector-fshr-256.ll      |    4 +
 llvm/test/CodeGen/X86/vector-fshr-512.ll      |   29 +-
 .../CodeGen/X86/vector-reduce-fmax-nnan.ll    |   28 +-
 llvm/test/CodeGen/X86/vector-reduce-fmin.ll   |    4 +-
 llvm/test/CodeGen/X86/vector-rotate-128.ll    |   30 +-
 llvm/test/CodeGen/X86/vector-rotate-256.ll    |    4 +-
 .../test/CodeGen/X86/vector-shift-ashr-128.ll |   54 +-
 .../test/CodeGen/X86/vector-shift-ashr-256.ll |   31 +-
 .../test/CodeGen/X86/vector-shift-ashr-512.ll |    3 +-
 .../test/CodeGen/X86/vector-shift-lshr-128.ll |   40 +-
 .../test/CodeGen/X86/vector-shift-lshr-256.ll |   31 +-
 .../test/CodeGen/X86/vector-shift-lshr-512.ll |    3 +-
 llvm/test/CodeGen/X86/vector-shift-shl-128.ll |   40 +-
 llvm/test/CodeGen/X86/vector-shift-shl-256.ll |   33 +-
 llvm/test/CodeGen/X86/vector-shift-shl-512.ll |    3 +-
 .../test/CodeGen/X86/vector-shuffle-128-v2.ll |   30 +-
 .../test/CodeGen/X86/vector-shuffle-128-v4.ll |   71 +-
 .../X86/vector-shuffle-combining-avx2.ll      |   19 +-
 .../X86/vector-shuffle-combining-ssse3.ll     |   11 +-
 .../CodeGen/X86/vector-shuffle-combining.ll   |  132 +-
 llvm/test/CodeGen/X86/vector-shuffle-v1.ll    |    6 +-
 llvm/test/CodeGen/X86/vector-shuffle-v192.ll  |   16 +-
 llvm/test/CodeGen/X86/vector-shuffle-v48.ll   |    6 +-
 llvm/test/CodeGen/X86/vselect.ll              |   30 +-
 llvm/test/CodeGen/X86/widened-broadcast.ll    |   62 +-
 .../CodeGen/X86/x86-interleaved-access.ll     |   35 +-
 llvm/test/CodeGen/X86/xop-shifts.ll           |    7 +-
 llvm/test/CodeGen/X86/xor.ll                  |   14 +-
 157 files changed, 4071 insertions(+), 3869 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index adfb96041c5c06b..d19495c3abad3f0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3479,6 +3479,37 @@ bool TargetLowering::SimplifyDemandedVectorElts(
 
     break;
   }
+  case ISD::LOAD: {
+    auto *Ld = cast<LoadSDNode>(Op);
+    if (!ISD::isNormalLoad(Ld) || !Ld->isSimple())
+      break;
+
+    // TODO: Handle arbitrary vector extract for isMask
+    if (DemandedElts.popcount() != 1)
+      break;
+
+    EVT VT = Ld->getValueType(0);
+    if (TLO.LegalOperations() &&
+        !isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+      break;
+
+    EVT EltVT = VT.getVectorElementType();
+    SDLoc DL(Ld);
+
+    unsigned Idx = DemandedElts.countTrailingZeros();
+
+    SDValue IdxVal = TLO.DAG.getVectorIdxConstant(Idx, DL);
+    SDValue Scalarized =
+        scalarizeExtractedVectorLoad(EltVT, DL, VT, IdxVal, Ld, TLO.DAG);
+    if (!Scalarized)
+      break;
+
+    TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Scalarized.getValue(1));
+
+    SDValue Insert = TLO.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
+                                     TLO.DAG.getUNDEF(VT), Scalarized, IdxVal);
+    return TLO.CombineTo(Op, Insert);
+  }
   case ISD::VECTOR_SHUFFLE: {
     SDValue LHS = Op.getOperand(0);
     SDValue RHS = Op.getOperand(1);
diff --git a/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
index f5aa4c666a56819..e9a4a83a4068388 100644
--- a/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -30,7 +30,7 @@ define void @test_i64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to i64
     %4 = add i64 %3, %3
@@ -43,7 +43,7 @@ define void @test_i64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to i64
     %4 = add i64 %3, %3
@@ -121,7 +121,7 @@ define void @test_f64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to double
     %4 = fadd double %3, %3
@@ -134,7 +134,7 @@ define void @test_f64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to double
     %4 = fadd double %3, %3
@@ -213,7 +213,7 @@ define void @test_v1i64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <1 x i64>
     %4 = add <1 x i64> %3, %3
@@ -226,7 +226,7 @@ define void @test_v1i64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <1 x i64>
     %4 = add <1 x i64> %3, %3
@@ -318,7 +318,7 @@ define void @test_v2f32_v1i64(ptr %p, ptr %q) {
 define void @test_v2f32_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: st1 { v{{[0-9]+}}.2s }
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <2 x float>
     %4 = fadd <2 x float> %3, %3
@@ -410,7 +410,7 @@ define void @test_v2i32_v1i64(ptr %p, ptr %q) {
 define void @test_v2i32_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: st1 { v{{[0-9]+}}.2s }
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <2 x i32>
     %4 = add <2 x i32> %3, %3
@@ -488,7 +488,7 @@ define void @test_v4i16_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.4h
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <4 x i16>
     %4 = add <4 x i16> %3, %3
@@ -501,7 +501,7 @@ define void @test_v4i16_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.4h
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <4 x i16>
     %4 = add <4 x i16> %3, %3
@@ -587,7 +587,7 @@ define void @test_v4f16_v2f32(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <4 x half>
     %4 = fadd <4 x half> %3, %3
@@ -602,7 +602,7 @@ define void @test_v4f16_v2i32(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <4 x half>
     %4 = fadd <4 x half> %3, %3
@@ -682,7 +682,7 @@ define void @test_v8i8_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.8b
 ; CHECK: st1 { v{{[0-9]+}}.8b }
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <8 x i8>
     %4 = add <8 x i8> %3, %3
@@ -695,7 +695,7 @@ define void @test_v8i8_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.8b
 ; CHECK: st1 { v{{[0-9]+}}.8b }
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <8 x i8>
     %4 = add <8 x i8> %3, %3
@@ -721,7 +721,7 @@ define void @test_f128_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: ext
 ; CHECK: str
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to fp128
     %4 = fadd fp128 %3, %3
@@ -734,7 +734,7 @@ define void @test_f128_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: ext
 ; CHECK: str
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to fp128
     %4 = fadd fp128 %3, %3
@@ -816,7 +816,7 @@ define void @test_v2f64_f128(ptr %p, ptr %q) {
 define void @test_v2f64_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: st1 { v{{[0-9]+}}.2d }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <2 x double>
     %4 = fadd <2 x double> %3, %3
@@ -895,7 +895,7 @@ define void @test_v2i64_f128(ptr %p, ptr %q) {
 define void @test_v2i64_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: st1 { v{{[0-9]+}}.2d }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <2 x i64>
     %4 = add <2 x i64> %3, %3
@@ -979,7 +979,7 @@ define void @test_v4f32_v2f64(ptr %p, ptr %q) {
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <4 x float>
     %4 = fadd <4 x float> %3, %3
@@ -994,7 +994,7 @@ define void @test_v4f32_v2i64(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <4 x float>
     %4 = fadd <4 x float> %3, %3
@@ -1062,7 +1062,7 @@ define void @test_v4i32_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <4 x i32>
     %4 = add <4 x i32> %3, %3
@@ -1075,7 +1075,7 @@ define void @test_v4i32_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <4 x i32>
     %4 = add <4 x i32> %3, %3
@@ -1141,7 +1141,7 @@ define void @test_v8i16_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.8h
 ; CHECK: st1 { v{{[0-9]+}}.8h }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <8 x i16>
     %4 = add <8 x i16> %3, %3
@@ -1154,7 +1154,7 @@ define void @test_v8i16_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.8h
 ; CHECK: st1 { v{{[0-9]+}}.8h }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <8 x i16>
     %4 = add <8 x i16> %3, %3
@@ -1234,7 +1234,7 @@ define void @test_v16i8_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.16b
 ; CHECK: st1 { v{{[0-9]+}}.16b }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <16 x i8>
     %4 = add <16 x i8> %3, %3
@@ -1247,7 +1247,7 @@ define void @test_v16i8_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.16b
 ; CHECK: st1 { v{{[0-9]+}}.16b }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <16 x i8>
     %4 = add <16 x i8> %3, %3
@@ -1315,7 +1315,7 @@ define %struct.struct1 @test_v4f16_struct(ptr %ret) {
 entry:
 ; CHECK: ld1 { {{v[0-9]+}}.4h }
 ; CHECK-NOT: rev
-  %0 = load <4 x half>, ptr %ret, align 2
+  %0 = load volatile <4 x half>, ptr %ret, align 2
   %1 = extractelement <4 x half> %0, i32 0
   %.fca.0.insert = insertvalue %struct.struct1 undef, half %1, 0
   ret %struct.struct1 %.fca.0.insert
diff --git a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
index d76e817e62a495d..ce657aa1f0b5bc5 100644
--- a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
+++ b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
@@ -27,10 +27,7 @@
 define i64 @g(ptr %p) {
 ; CHECK-LABEL: g:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    add x9, x8, x8
-; CHECK-NEXT:    add x8, x9, x8
-; CHECK-NEXT:    sub x0, x8, x8
+; CHECK-NEXT:    mov x0, xzr
 ; CHECK-NEXT:    ret
   %vec = load <2 x i64>, ptr %p, align 1
   %elt = extractelement <2 x i64> %vec, i32 1
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 66f26fc9d859733..d39e537edb7861f 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -679,28 +679,27 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 160
 ; CHECK-SD-NEXT:    .cfi_offset w30, -16
 ; CHECK-SD-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    add x8, sp, #176
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
-; CHECK-SD-NEXT:    ldr d5, [sp, #184]
-; CHECK-SD-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldp d3, d2, [sp, #168]
+; CHECK-SD-NEXT:    str q3, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp d3, d2, [sp, #160]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
 ; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    mov v0.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
-; CHECK-SD-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldr d5, [sp, #160]
-; CHECK-SD-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-SD-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    ld1 { v2.d }[1], [x8]
+; CHECK-SD-NEXT:    stp q6, q3, [sp, #80] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    str q2, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr d2, [sp, #184]
+; CHECK-SD-NEXT:    str q2, [sp, #64] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    bl __lttf2
 ; CHECK-SD-NEXT:    cmp w0, #0
-; CHECK-SD-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    cset w8, lt
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-SD-NEXT:    fmov d0, x8
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q0, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    bl __lttf2
 ; CHECK-SD-NEXT:    cmp w0, #0
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -708,19 +707,19 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-SD-NEXT:    fmov d1, x8
 ; CHECK-SD-NEXT:    mov v1.d[1], v0.d[0]
-; CHECK-SD-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    bl __lttf2
-; CHECK-SD-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q0, q3, [sp, #80] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    cmp w0, #0
-; CHECK-SD-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q2, q1, [sp, #32] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    cset w8, lt
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-SD-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr q4, [sp, #64] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    fmov d2, x8
-; CHECK-SD-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT:    bsl v2.16b, v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
@@ -815,20 +814,20 @@ define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double>
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    add x8, sp, #16
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
-; CHECK-SD-NEXT:    ldr d16, [sp, #24]
-; CHECK-SD-NEXT:    ldr d17, [sp]
 ; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    ldp d1, d4, [sp, #8]
 ; CHECK-SD-NEXT:    fcmgt v2.2d, v5.2d, v2.2d
-; CHECK-SD-NEXT:    mov v1.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-SD-NEXT:    bsl v2.16b, v17.16b, v16.16b
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    ldp d3, d1, [sp]
+; CHECK-SD-NEXT:    ld1 { v1.d }[1], [x8]
 ; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v1.16b
+; CHECK-SD-NEXT:    ldr d1, [sp, #24]
+; CHECK-SD-NEXT:    bsl v2.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
index 31ead890ba8ac7d..ed22243eeef45fc 100644
--- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
+++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
@@ -45,11 +45,11 @@ define void @loop(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K,
 ; CHECK-NEXT:    mov w8, w3
 ; CHECK-NEXT:  .LBB1_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q2, [x1], #2
+; CHECK-NEXT:    ldr q2, [x2], #2
 ; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    ldr q3, [x2], #2
-; CHECK-NEXT:    fmlal v0.4s, v3.4h, v2.h[0]
-; CHECK-NEXT:    fmlal2 v1.4s, v3.4h, v2.h[0]
+; CHECK-NEXT:    ld1r { v3.8h }, [x1], #2
+; CHECK-NEXT:    fmlal v0.4s, v2.4h, v3.4h
+; CHECK-NEXT:    fmlal2 v1.4s, v2.4h, v3.4h
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
 ; CHECK-NEXT:    stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index e284795760c5cad..f586647439d2558 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1123,30 +1123,29 @@ entry:
 define <3 x i64> @v3i64_i64(<3 x i64> %a, <3 x i64> %b, <3 x i64> %d, <3 x i64> %e) {
 ; CHECK-SD-LABEL: v3i64_i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
 ; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    add x8, sp, #16
 ; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    ldr d16, [sp, #24]
-; CHECK-SD-NEXT:    ldr d17, [sp]
 ; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    ldp d1, d4, [sp, #8]
-; CHECK-SD-NEXT:    mov v1.d[1], v4.d[0]
+; CHECK-SD-NEXT:    ldp d4, d1, [sp]
+; CHECK-SD-NEXT:    ld1 { v1.d }[1], [x8]
 ; CHECK-SD-NEXT:    cmgt v0.2d, v3.2d, v0.2d
 ; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v1.16b
 ; CHECK-SD-NEXT:    cmgt v1.2d, v5.2d, v2.2d
-; CHECK-SD-NEXT:    mov v2.16b, v1.16b
+; CHECK-SD-NEXT:    ldr d2, [sp, #24]
+; CHECK-SD-NEXT:    bit v2.16b, v4.16b, v1.16b
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-SD-NEXT:    bsl v2.16b, v17.16b, v16.16b
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: v3i64_i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
index ad4efeaf39247a3..1e6427c4cd49563 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
@@ -33,10 +33,7 @@ define half @extractelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
 define half @extractelement_v16f16(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.h, z0.h[15]
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ldr h0, [x0, #30]
 ; CHECK-NEXT:    ret
     %op1 = load <16 x half>, ptr %a
     %r = extractelement <16 x half> %op1, i64 15
@@ -44,22 +41,10 @@ define half @extractelement_v16f16(ptr %a) vscale_range(2,0) #0 {
 }
 
 define half @extractelement_v32f16(ptr %a) #0 {
-; VBITS_GE_256-LABEL: extractelement_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
-; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extractelement_v32f16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    mov z0.h, z0.h[31]
-; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: extractelement_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0, #62]
+; CHECK-NEXT:    ret
     %op1 = load <32 x half>, ptr %a
     %r = extractelement <32 x half> %op1, i64 31
     ret half %r
@@ -68,11 +53,7 @@ define half @extractelement_v32f16(ptr %a) #0 {
 define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v64f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    mov w8, #63 // =0x3f
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.h, xzr, x8
-; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    ldr h0, [x0, #126]
 ; CHECK-NEXT:    ret
     %op1 = load <64 x half>, ptr %a
     %r = extractelement <64 x half> %op1, i64 63
@@ -82,11 +63,7 @@ define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 {
 define half @extractelement_v128f16(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v128f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    mov w8, #127 // =0x7f
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.h, xzr, x8
-; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    ldr h0, [x0, #254]
 ; CHECK-NEXT:    ret
     %op1 = load <128 x half>, ptr %a
     %r = extractelement <128 x half> %op1, i64 127
@@ -117,10 +94,7 @@ define float @extractelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
 define float @extractelement_v8f32(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.s, z0.s[7]
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ldr s0, [x0, #28]
 ; CHECK-NEXT:    ret
     %op1 = load <8 x float>, ptr %a
     %r = extractelement <8 x float> %op1, i64 7
@@ -128,22 +102,10 @@ define float @extractelement_v8f32(ptr %a) vscale_range(2,0) #0 {
 }
 
 define float @extractelement_v16f32(ptr %a) #0 {
-; VBITS_GE_256-LABEL: extractelement_v16f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
-; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extractelement_v16f32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    mov z0.s, z0.s[15]
-; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: extractelement_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0, #60]
+; CHECK-NEXT:    ret
     %op1 = load <16 x float>, ptr %a
     %r = extractelement <16 x float> %op1, i64 15
     ret float %r
@@ -152,11 +114,7 @@ define float @extractelement_v16f32(ptr %a) #0 {
 define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v32f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ldr s0, [x0, #124]
 ; CHECK-NEXT:    ret
     %op1 = load <32 x float>, ptr %a
     %r = extractelement <32 x float> %op1, i64 31
@@ -166,11 +124,7 @@ define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 {
 define float @extractelement_v64f32(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v64f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    mov w8, #63 // =0x3f
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ldr s0, [x0, #252]
 ; CHECK-NEXT:    ret
     %op1 = load <64 x float>, ptr %a
     %r = extractelement <64 x float> %op1, i64 63
@@ -199,10 +153,7 @@ define double @extractelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
 define double @extractelement_v4f64(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.d, z0.d[3]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ldr d0, [x0, #24]
 ; CHECK-NEXT:    ret
     %op1 = load <4 x double>, ptr %a
     %r = extractelement <4 x double> %op1, i64 3
@@ -210,22 +161,10 @@ define double @extractelement_v4f64(ptr %a) vscale_range(2,0) #0 {
 }
 
 define double @extractelement_v8f64(ptr %a) #0 {
-; VBITS_GE_256-LABEL: extractelement_v8f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extractelement_v8f64:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    mov z0.d, z0.d[7]
-; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: extractelement_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0, #56]
+; CHECK-NEXT:    ret
     %op1 = load <8 x double>, ptr %a
     %r = extractelement <8 x double> %op1, i64 7
     ret double %r
@@ -234,11 +173,7 @@ define double @extractelement_v8f64(ptr %a) #0 {
 define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    mov w8, #15 // =0xf
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.d, xzr, x8
-; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ldr d0, [x0, #120]
 ; CHECK-NEXT:    ret
     %op1 = load <16 x double>, ptr %a
     %r = extractelement <16 x double> %op1, i64 15
@@ -248,11 +183,7 @@ define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 {
 define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v32f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.d, xzr, x8
-; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ldr d0, [x0, #248]
 ; CHECK-NEXT:    ret
     %op1 = load <32 x double>, ptr %a
     %r = extractelement <32 x double> %op1, i64 31
@@ -260,3 +191,6 @@ define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 {
 }
 
 attributes #0 = { "target-features"="+sve" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; VBITS_GE_256: {{.*}}
+; VBITS_GE_512: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 27e95489f8ad7a5..5233d292c4eaf42 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -438,8 +438,7 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    cbnz x8, .LBB15_2
 ; CHECK-NEXT:  // %bb.1: // %cond.load
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index e3e06dcdf17f306..5af3a88c711bd12 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -415,13 +415,13 @@ define void @masked_scatter_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    cbnz x8, .LBB15_2
 ; CHECK-NEXT:  // %bb.1: // %cond.store
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    str d0, [x8]
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    str d0, [x9]
 ; CHECK-NEXT:  .LBB15_2: // %else
 ; CHECK-NEXT:    ret
   %vals = load <1 x i64>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index cf308e6c4395ffe..f0e5fa6e03090f6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -71,18 +71,12 @@ define half @extractelement_v8f16(<8 x half> %op1) {
 define half @extractelement_v16f16(ptr %a) {
 ; CHECK-LABEL: extractelement_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z0.h, z0.h[7]
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ldr h0, [x0, #30]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr h0, [x0, #30]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %r = extractelement <16 x half> %op1, i64 15
@@ -131,18 +125,12 @@ define float @extractelement_v4f32(<4 x float> %op1) {
 define float @extractelement_v8f32(ptr %a) {
 ; CHECK-LABEL: extractelement_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z0.s, z0.s[3]
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ldr s0, [x0, #28]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr s0, [x0, #28]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %r = extractelement <8 x float> %op1, i64 7
@@ -182,18 +170,12 @@ define double @extractelement_v2f64(<2 x double> %op1) {
 define double @extractelement_v4f64(ptr %a) {
 ; CHECK-LABEL: extractelement_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ldr d0, [x0, #24]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr d0, [x0, #24]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %r = extractelement <4 x double> %op1, i64 3
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index fab45c9dc3bc3c1..6ffe8c5fab29f81 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -880,44 +880,43 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
+; SI-NEXT:    s_load_dword s4, s[4:5], 0xe
+; SI-NEXT:    s_brev_b32 s5, -2
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_bfi_b32 v0, s5, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x38
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_brev_b32 s0, -2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_bfi_b32 v2, s0, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_bfi_b32 v2, s4, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x38
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s3, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %sign.trunc = fptrunc double %sign to float
   %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 5f75a2f29a026f3..ad126bb22b5831e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -13,14 +13,14 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x1d
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x1e
+; SI-NEXT:    s_brev_b32 s5, -2
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_bfi_b32 v1, s4, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_bfi_b32 v1, s5, v0, v1
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -28,32 +28,32 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
 ; VI-LABEL: s_test_copysign_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x74
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x78
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_brev_b32 s2, -2
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_bfi_b32 v1, s4, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x74
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x4c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x78
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
+; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s3, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
index 6c921441c972d3a..ea2fe9d6208938c 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
@@ -19,31 +19,30 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; FORWARDXNACK-LABEL: shuffle_v4f16_234u:
 ; FORWARDXNACK:       ; %bb.0:
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FORWARDXNACK-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; FORWARDXNACK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; FORWARDXNACK-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; FORWARDXNACK-NEXT:    global_load_dword v5, v[2:3], off
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(1)
-; FORWARDXNACK-NEXT:    v_mov_b32_e32 v0, v6
+; FORWARDXNACK-NEXT:    v_mov_b32_e32 v0, v4
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(0)
-; FORWARDXNACK-NEXT:    v_mov_b32_e32 v1, v4
+; FORWARDXNACK-NEXT:    v_mov_b32_e32 v1, v5
 ; FORWARDXNACK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; REVERSEXNACK-LABEL: shuffle_v4f16_234u:
 ; REVERSEXNACK:       ; %bb.0:
 ; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v6, v1
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v5, v0
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v4, v3
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v3, v2
-; REVERSEXNACK-NEXT:    global_load_dword v0, v[5:6], off offset:4
-; REVERSEXNACK-NEXT:    global_load_dwordx2 v[1:2], v[3:4], off
+; REVERSEXNACK-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; REVERSEXNACK-NEXT:    global_load_dword v4, v[2:3], off
+; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(1)
+; REVERSEXNACK-NEXT:    v_mov_b32_e32 v0, v5
 ; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(0)
+; REVERSEXNACK-NEXT:    v_mov_b32_e32 v1, v4
 ; REVERSEXNACK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOXNACK-LABEL: shuffle_v4f16_234u:
 ; NOXNACK:       ; %bb.0:
 ; NOXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOXNACK-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; NOXNACK-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; NOXNACK-NEXT:    global_load_dword v1, v[2:3], off
 ; NOXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; NOXNACK-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 5dff660912e4025..a656ce2fa9d71f5 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -34,7 +34,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dword s22, s[4:5], 0x0
 ; CHECK-NEXT:    s_movk_i32 s20, 0x130
 ; CHECK-NEXT:    s_mov_b32 s21, s24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -55,7 +55,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b32 s20, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_writelane_b32 v7, s49, 13
-; CHECK-NEXT:    v_mov_b32_e32 v2, s28
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v1
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
@@ -318,8 +318,8 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
 ; CHECK-NEXT:    ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
+; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 3d27b5fe7f30b34..5b96fb06afbbf41 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -112,10 +112,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4-LABEL: llvm_amdgcn_is_shared:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
-; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s1, s[6:7], 0x40
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
@@ -124,10 +124,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_is_shared:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0xcc
-; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0xcc
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
@@ -166,10 +166,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4-LABEL: llvm_amdgcn_is_private:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
-; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s1, s[6:7], 0x44
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
@@ -178,10 +178,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_is_private:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0xc8
-; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0xc8
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
diff --git a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
index 363d568f9c11c95..f06175d1adaec3e 100644
--- a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -16,7 +16,7 @@ define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
 .entry:
   %tmp31 = sext i32 %arg18 to i64
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) @indexable, i64 0, i64 %tmp31
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -31,7 +31,7 @@ define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
   %tmp1 = zext i32 %arg18 to i64
   %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -46,7 +46,7 @@ define amdgpu_ps float @const_nonuniform(i32 %arg18) {
   %tmp1 = zext i32 %arg18 to i64
   %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 1
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -61,7 +61,7 @@ define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
   %tmp1 = zext i32 %arg18 to i64
   %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
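
The volatile markers above are the mechanism hinted at in the commit message for keeping these tests meaningful: a volatile access may not be widened, narrowed, or deleted, so the new SimplifyDemandedVectorElts handling cannot shrink the <3 x float> load down to the single demanded element. A minimal standalone sketch of the pattern (a hypothetical test for illustration, not part of this patch):

; Only element 0 is demanded, but the volatile qualifier forces the
; backend to keep the full <3 x float> load under test.
define amdgpu_ps float @volatile_keeps_wide_load(ptr addrspace(1) %p) {
.entry:
  %vec = load volatile <3 x float>, ptr addrspace(1) %p, align 16
  %elt = extractelement <3 x float> %vec, i32 0
  ret float %elt
}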
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index db802732e987b51..87a21a46eaff510 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -93,8 +93,8 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) {
 }
 
 ; GCN-LABEL: {{^}}s_trunc_i64_to_i1:
-; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
-; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
+; SI: s_load_dword s[[SLO:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dword s[[SLO:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: s_bitcmp1_b32 s[[SLO]], 0
 ; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12
 define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) {
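
The check update above appears to reflect the load-narrowing side of the change: the trunc to i1 only demands the low bits of %x, so a single s_load_dword now suffices where the old code fetched the full 64-bit pair. A reduced illustration of the IR driving those checks (hypothetical, mirroring the test above rather than copied from it):

; Only the low bit of %x is live, so the backend is free to load one
; dword instead of s_load_dwordx2.
define amdgpu_kernel void @trunc_low_bit_only(ptr addrspace(1) %out, i64 %x) {
  %bit = trunc i64 %x to i1
  %sel = select i1 %bit, i32 63, i32 -12
  store i32 %sel, ptr addrspace(1) %out
  ret void
}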
diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
index b079a94b5fcc3dd..fc0740ab2693e9b 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
@@ -5,31 +6,31 @@
 define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_ushort v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b16 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b16 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b16 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x i8> %val0, <2 x i8> poison, <2 x i32> <i32 1, i32 1>
@@ -39,37 +40,37 @@ entry:
 define <4 x i8> @shuffle_v4i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x i8> %val0, <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -79,49 +80,49 @@ entry:
 define <8 x i8> @shuffle_v8i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x i8> %val0, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -131,73 +132,73 @@ entry:
 define <16 x i8> @shuffle_v16i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x i8> %val0, <16 x i8> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -207,121 +208,121 @@ entry:
 define <32 x i8> @shuffle_v32i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  v_mov_b32_e32 v16, v0
-; GFX9-NEXT:  v_mov_b32_e32 v17, v0
-; GFX9-NEXT:  v_mov_b32_e32 v18, v0
-; GFX9-NEXT:  v_mov_b32_e32 v19, v0
-; GFX9-NEXT:  v_mov_b32_e32 v20, v0
-; GFX9-NEXT:  v_mov_b32_e32 v21, v0
-; GFX9-NEXT:  v_mov_b32_e32 v22, v0
-; GFX9-NEXT:  v_mov_b32_e32 v23, v0
-; GFX9-NEXT:  v_mov_b32_e32 v24, v0
-; GFX9-NEXT:  v_mov_b32_e32 v25, v0
-; GFX9-NEXT:  v_mov_b32_e32 v26, v0
-; GFX9-NEXT:  v_mov_b32_e32 v27, v0
-; GFX9-NEXT:  v_mov_b32_e32 v28, v0
-; GFX9-NEXT:  v_mov_b32_e32 v29, v0
-; GFX9-NEXT:  v_mov_b32_e32 v30, v0
-; GFX9-NEXT:  v_mov_b32_e32 v31, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v17, v0
+; GFX9-NEXT:    v_mov_b32_e32 v18, v0
+; GFX9-NEXT:    v_mov_b32_e32 v19, v0
+; GFX9-NEXT:    v_mov_b32_e32 v20, v0
+; GFX9-NEXT:    v_mov_b32_e32 v21, v0
+; GFX9-NEXT:    v_mov_b32_e32 v22, v0
+; GFX9-NEXT:    v_mov_b32_e32 v23, v0
+; GFX9-NEXT:    v_mov_b32_e32 v24, v0
+; GFX9-NEXT:    v_mov_b32_e32 v25, v0
+; GFX9-NEXT:    v_mov_b32_e32 v26, v0
+; GFX9-NEXT:    v_mov_b32_e32 v27, v0
+; GFX9-NEXT:    v_mov_b32_e32 v28, v0
+; GFX9-NEXT:    v_mov_b32_e32 v29, v0
+; GFX9-NEXT:    v_mov_b32_e32 v30, v0
+; GFX9-NEXT:    v_mov_b32_e32 v31, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  v_mov_b32_e32 v16, v0
-; GFX10-NEXT:  v_mov_b32_e32 v17, v0
-; GFX10-NEXT:  v_mov_b32_e32 v18, v0
-; GFX10-NEXT:  v_mov_b32_e32 v19, v0
-; GFX10-NEXT:  v_mov_b32_e32 v20, v0
-; GFX10-NEXT:  v_mov_b32_e32 v21, v0
-; GFX10-NEXT:  v_mov_b32_e32 v22, v0
-; GFX10-NEXT:  v_mov_b32_e32 v23, v0
-; GFX10-NEXT:  v_mov_b32_e32 v24, v0
-; GFX10-NEXT:  v_mov_b32_e32 v25, v0
-; GFX10-NEXT:  v_mov_b32_e32 v26, v0
-; GFX10-NEXT:  v_mov_b32_e32 v27, v0
-; GFX10-NEXT:  v_mov_b32_e32 v28, v0
-; GFX10-NEXT:  v_mov_b32_e32 v29, v0
-; GFX10-NEXT:  v_mov_b32_e32 v30, v0
-; GFX10-NEXT:  v_mov_b32_e32 v31, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v17, v0
+; GFX10-NEXT:    v_mov_b32_e32 v18, v0
+; GFX10-NEXT:    v_mov_b32_e32 v19, v0
+; GFX10-NEXT:    v_mov_b32_e32 v20, v0
+; GFX10-NEXT:    v_mov_b32_e32 v21, v0
+; GFX10-NEXT:    v_mov_b32_e32 v22, v0
+; GFX10-NEXT:    v_mov_b32_e32 v23, v0
+; GFX10-NEXT:    v_mov_b32_e32 v24, v0
+; GFX10-NEXT:    v_mov_b32_e32 v25, v0
+; GFX10-NEXT:    v_mov_b32_e32 v26, v0
+; GFX10-NEXT:    v_mov_b32_e32 v27, v0
+; GFX10-NEXT:    v_mov_b32_e32 v28, v0
+; GFX10-NEXT:    v_mov_b32_e32 v29, v0
+; GFX10-NEXT:    v_mov_b32_e32 v30, v0
+; GFX10-NEXT:    v_mov_b32_e32 v31, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  v_mov_b32_e32 v16, v0
-; GFX11-NEXT:  v_mov_b32_e32 v17, v0
-; GFX11-NEXT:  v_mov_b32_e32 v18, v0
-; GFX11-NEXT:  v_mov_b32_e32 v19, v0
-; GFX11-NEXT:  v_mov_b32_e32 v20, v0
-; GFX11-NEXT:  v_mov_b32_e32 v21, v0
-; GFX11-NEXT:  v_mov_b32_e32 v22, v0
-; GFX11-NEXT:  v_mov_b32_e32 v23, v0
-; GFX11-NEXT:  v_mov_b32_e32 v24, v0
-; GFX11-NEXT:  v_mov_b32_e32 v25, v0
-; GFX11-NEXT:  v_mov_b32_e32 v26, v0
-; GFX11-NEXT:  v_mov_b32_e32 v27, v0
-; GFX11-NEXT:  v_mov_b32_e32 v28, v0
-; GFX11-NEXT:  v_mov_b32_e32 v29, v0
-; GFX11-NEXT:  v_mov_b32_e32 v30, v0
-; GFX11-NEXT:  v_mov_b32_e32 v31, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    v_mov_b32_e32 v16, v0
+; GFX11-NEXT:    v_mov_b32_e32 v17, v0
+; GFX11-NEXT:    v_mov_b32_e32 v18, v0
+; GFX11-NEXT:    v_mov_b32_e32 v19, v0
+; GFX11-NEXT:    v_mov_b32_e32 v20, v0
+; GFX11-NEXT:    v_mov_b32_e32 v21, v0
+; GFX11-NEXT:    v_mov_b32_e32 v22, v0
+; GFX11-NEXT:    v_mov_b32_e32 v23, v0
+; GFX11-NEXT:    v_mov_b32_e32 v24, v0
+; GFX11-NEXT:    v_mov_b32_e32 v25, v0
+; GFX11-NEXT:    v_mov_b32_e32 v26, v0
+; GFX11-NEXT:    v_mov_b32_e32 v27, v0
+; GFX11-NEXT:    v_mov_b32_e32 v28, v0
+; GFX11-NEXT:    v_mov_b32_e32 v29, v0
+; GFX11-NEXT:    v_mov_b32_e32 v30, v0
+; GFX11-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x i8> %val0, <32 x i8> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -331,28 +332,28 @@ entry:
 define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x i16> %val0, <2 x i16> poison, <2 x i32> <i32 1, i32 1>
@@ -362,32 +363,32 @@ entry:
 define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x i16> %val0, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -397,38 +398,38 @@ entry:
 define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x i16> %val0, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -438,50 +439,50 @@ entry:
 define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x i16> %val0, <16 x i16> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -491,74 +492,74 @@ entry:
 define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x i16> %val0, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -568,27 +569,27 @@ entry:
 define <2 x i32> @shuffle_v2i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x i32> %val0, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
@@ -598,33 +599,33 @@ entry:
 define <4 x i32> @shuffle_v4i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -634,45 +635,45 @@ entry:
 define <8 x i32> @shuffle_v8i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x i32> %val0, <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -682,69 +683,69 @@ entry:
 define <16 x i32> @shuffle_v16i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x i32> %val0, <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -754,117 +755,117 @@ entry:
 define <32 x i32> @shuffle_v32i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  v_mov_b32_e32 v16, v0
-; GFX9-NEXT:  v_mov_b32_e32 v17, v0
-; GFX9-NEXT:  v_mov_b32_e32 v18, v0
-; GFX9-NEXT:  v_mov_b32_e32 v19, v0
-; GFX9-NEXT:  v_mov_b32_e32 v20, v0
-; GFX9-NEXT:  v_mov_b32_e32 v21, v0
-; GFX9-NEXT:  v_mov_b32_e32 v22, v0
-; GFX9-NEXT:  v_mov_b32_e32 v23, v0
-; GFX9-NEXT:  v_mov_b32_e32 v24, v0
-; GFX9-NEXT:  v_mov_b32_e32 v25, v0
-; GFX9-NEXT:  v_mov_b32_e32 v26, v0
-; GFX9-NEXT:  v_mov_b32_e32 v27, v0
-; GFX9-NEXT:  v_mov_b32_e32 v28, v0
-; GFX9-NEXT:  v_mov_b32_e32 v29, v0
-; GFX9-NEXT:  v_mov_b32_e32 v30, v0
-; GFX9-NEXT:  v_mov_b32_e32 v31, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v17, v0
+; GFX9-NEXT:    v_mov_b32_e32 v18, v0
+; GFX9-NEXT:    v_mov_b32_e32 v19, v0
+; GFX9-NEXT:    v_mov_b32_e32 v20, v0
+; GFX9-NEXT:    v_mov_b32_e32 v21, v0
+; GFX9-NEXT:    v_mov_b32_e32 v22, v0
+; GFX9-NEXT:    v_mov_b32_e32 v23, v0
+; GFX9-NEXT:    v_mov_b32_e32 v24, v0
+; GFX9-NEXT:    v_mov_b32_e32 v25, v0
+; GFX9-NEXT:    v_mov_b32_e32 v26, v0
+; GFX9-NEXT:    v_mov_b32_e32 v27, v0
+; GFX9-NEXT:    v_mov_b32_e32 v28, v0
+; GFX9-NEXT:    v_mov_b32_e32 v29, v0
+; GFX9-NEXT:    v_mov_b32_e32 v30, v0
+; GFX9-NEXT:    v_mov_b32_e32 v31, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  v_mov_b32_e32 v16, v0
-; GFX10-NEXT:  v_mov_b32_e32 v17, v0
-; GFX10-NEXT:  v_mov_b32_e32 v18, v0
-; GFX10-NEXT:  v_mov_b32_e32 v19, v0
-; GFX10-NEXT:  v_mov_b32_e32 v20, v0
-; GFX10-NEXT:  v_mov_b32_e32 v21, v0
-; GFX10-NEXT:  v_mov_b32_e32 v22, v0
-; GFX10-NEXT:  v_mov_b32_e32 v23, v0
-; GFX10-NEXT:  v_mov_b32_e32 v24, v0
-; GFX10-NEXT:  v_mov_b32_e32 v25, v0
-; GFX10-NEXT:  v_mov_b32_e32 v26, v0
-; GFX10-NEXT:  v_mov_b32_e32 v27, v0
-; GFX10-NEXT:  v_mov_b32_e32 v28, v0
-; GFX10-NEXT:  v_mov_b32_e32 v29, v0
-; GFX10-NEXT:  v_mov_b32_e32 v30, v0
-; GFX10-NEXT:  v_mov_b32_e32 v31, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v17, v0
+; GFX10-NEXT:    v_mov_b32_e32 v18, v0
+; GFX10-NEXT:    v_mov_b32_e32 v19, v0
+; GFX10-NEXT:    v_mov_b32_e32 v20, v0
+; GFX10-NEXT:    v_mov_b32_e32 v21, v0
+; GFX10-NEXT:    v_mov_b32_e32 v22, v0
+; GFX10-NEXT:    v_mov_b32_e32 v23, v0
+; GFX10-NEXT:    v_mov_b32_e32 v24, v0
+; GFX10-NEXT:    v_mov_b32_e32 v25, v0
+; GFX10-NEXT:    v_mov_b32_e32 v26, v0
+; GFX10-NEXT:    v_mov_b32_e32 v27, v0
+; GFX10-NEXT:    v_mov_b32_e32 v28, v0
+; GFX10-NEXT:    v_mov_b32_e32 v29, v0
+; GFX10-NEXT:    v_mov_b32_e32 v30, v0
+; GFX10-NEXT:    v_mov_b32_e32 v31, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  v_mov_b32_e32 v16, v0
-; GFX11-NEXT:  v_mov_b32_e32 v17, v0
-; GFX11-NEXT:  v_mov_b32_e32 v18, v0
-; GFX11-NEXT:  v_mov_b32_e32 v19, v0
-; GFX11-NEXT:  v_mov_b32_e32 v20, v0
-; GFX11-NEXT:  v_mov_b32_e32 v21, v0
-; GFX11-NEXT:  v_mov_b32_e32 v22, v0
-; GFX11-NEXT:  v_mov_b32_e32 v23, v0
-; GFX11-NEXT:  v_mov_b32_e32 v24, v0
-; GFX11-NEXT:  v_mov_b32_e32 v25, v0
-; GFX11-NEXT:  v_mov_b32_e32 v26, v0
-; GFX11-NEXT:  v_mov_b32_e32 v27, v0
-; GFX11-NEXT:  v_mov_b32_e32 v28, v0
-; GFX11-NEXT:  v_mov_b32_e32 v29, v0
-; GFX11-NEXT:  v_mov_b32_e32 v30, v0
-; GFX11-NEXT:  v_mov_b32_e32 v31, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    v_mov_b32_e32 v16, v0
+; GFX11-NEXT:    v_mov_b32_e32 v17, v0
+; GFX11-NEXT:    v_mov_b32_e32 v18, v0
+; GFX11-NEXT:    v_mov_b32_e32 v19, v0
+; GFX11-NEXT:    v_mov_b32_e32 v20, v0
+; GFX11-NEXT:    v_mov_b32_e32 v21, v0
+; GFX11-NEXT:    v_mov_b32_e32 v22, v0
+; GFX11-NEXT:    v_mov_b32_e32 v23, v0
+; GFX11-NEXT:    v_mov_b32_e32 v24, v0
+; GFX11-NEXT:    v_mov_b32_e32 v25, v0
+; GFX11-NEXT:    v_mov_b32_e32 v26, v0
+; GFX11-NEXT:    v_mov_b32_e32 v27, v0
+; GFX11-NEXT:    v_mov_b32_e32 v28, v0
+; GFX11-NEXT:    v_mov_b32_e32 v29, v0
+; GFX11-NEXT:    v_mov_b32_e32 v30, v0
+; GFX11-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x i32> %val0, <32 x i32> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -874,28 +875,28 @@ entry:
 define <2 x bfloat> @shuffle_v2bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x bfloat> %val0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
@@ -905,31 +906,31 @@ entry:
 define <3 x bfloat> @shuffle_v3bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v3bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v1, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v1, v1, s4
-; GFX9-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v1, v1, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v1, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX10-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v1, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX11-NEXT:  v_alignbit_b32 v1, s0, v1, 16
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x bfloat> %val0, <3 x bfloat> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -939,32 +940,32 @@ entry:
 define <4 x bfloat> @shuffle_v4bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x bfloat> %val0, <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -974,35 +975,35 @@ entry:
 define <6 x bfloat> @shuffle_v6bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v6bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x bfloat> %val0, <6 x bfloat> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1012,38 +1013,38 @@ entry:
 define <8 x bfloat> @shuffle_v8bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x bfloat> %val0, <8 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1053,50 +1054,50 @@ entry:
 define <16 x bfloat> @shuffle_v16bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x bfloat> %val0, <16 x bfloat> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1106,74 +1107,74 @@ entry:
 define <32 x bfloat> @shuffle_v32bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x bfloat> %val0, <32 x bfloat> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1183,28 +1184,28 @@ entry:
 define <2 x half> @shuffle_v2f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x half> %val0, <2 x half> poison, <2 x i32> <i32 1, i32 1>
@@ -1214,31 +1215,31 @@ entry:
 define <3 x half> @shuffle_v3f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v3f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v1, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v1, v1, s4
-; GFX9-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v1, v1, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v1, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX10-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v1, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX11-NEXT:  v_alignbit_b32 v1, s0, v1, 16
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x half> %val0, <3 x half> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -1248,32 +1249,32 @@ entry:
 define <4 x half> @shuffle_v4f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -1283,35 +1284,35 @@ entry:
 define <6 x half> @shuffle_v6f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v6f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x half> %val0, <6 x half> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1321,38 +1322,38 @@ entry:
 define <8 x half> @shuffle_v8f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x half> %val0, <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1362,50 +1363,50 @@ entry:
 define <16 x half> @shuffle_v16f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x half> %val0, <16 x half> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1415,74 +1416,74 @@ entry:
 define <32 x half> @shuffle_v32f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x half> %val0, <32 x half> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1492,27 +1493,27 @@ entry:
 define <2 x float> @shuffle_v2f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x float> %val0, <2 x float> poison, <2 x i32> <i32 1, i32 1>
@@ -1522,30 +1523,30 @@ entry:
 define <3 x float> @shuffle_v3f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx3 v[0:2], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b96 v[0:2], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x float> %val0, <3 x float> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -1555,33 +1556,33 @@ entry:
 define <4 x float> @shuffle_v4f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x float> %val0, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -1591,39 +1592,39 @@ entry:
 define <6 x float> @shuffle_v6f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x float> %val0, <6 x float> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1633,45 +1634,45 @@ entry:
 define <8 x float> @shuffle_v8f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  v_mov_b32_e32 v6, v1
-; GFX9-NEXT:  v_mov_b32_e32 v7, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  v_mov_b32_e32 v6, v1
-; GFX10-NEXT:  v_mov_b32_e32 v7, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  v_mov_b32_e32 v6, v1
-; GFX11-NEXT:  v_mov_b32_e32 v7, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mov_b32_e32 v7, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x float> %val0, <8 x float> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1681,69 +1682,69 @@ entry:
 define <16 x float> @shuffle_v16f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  v_mov_b32_e32 v6, v1
-; GFX9-NEXT:  v_mov_b32_e32 v7, v1
-; GFX9-NEXT:  v_mov_b32_e32 v8, v1
-; GFX9-NEXT:  v_mov_b32_e32 v9, v1
-; GFX9-NEXT:  v_mov_b32_e32 v10, v1
-; GFX9-NEXT:  v_mov_b32_e32 v11, v1
-; GFX9-NEXT:  v_mov_b32_e32 v12, v1
-; GFX9-NEXT:  v_mov_b32_e32 v13, v1
-; GFX9-NEXT:  v_mov_b32_e32 v14, v1
-; GFX9-NEXT:  v_mov_b32_e32 v15, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v1
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v14, v1
+; GFX9-NEXT:    v_mov_b32_e32 v15, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  v_mov_b32_e32 v6, v1
-; GFX10-NEXT:  v_mov_b32_e32 v7, v1
-; GFX10-NEXT:  v_mov_b32_e32 v8, v1
-; GFX10-NEXT:  v_mov_b32_e32 v9, v1
-; GFX10-NEXT:  v_mov_b32_e32 v10, v1
-; GFX10-NEXT:  v_mov_b32_e32 v11, v1
-; GFX10-NEXT:  v_mov_b32_e32 v12, v1
-; GFX10-NEXT:  v_mov_b32_e32 v13, v1
-; GFX10-NEXT:  v_mov_b32_e32 v14, v1
-; GFX10-NEXT:  v_mov_b32_e32 v15, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v1
+; GFX10-NEXT:    v_mov_b32_e32 v12, v1
+; GFX10-NEXT:    v_mov_b32_e32 v13, v1
+; GFX10-NEXT:    v_mov_b32_e32 v14, v1
+; GFX10-NEXT:    v_mov_b32_e32 v15, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  v_mov_b32_e32 v6, v1
-; GFX11-NEXT:  v_mov_b32_e32 v7, v1
-; GFX11-NEXT:  v_mov_b32_e32 v8, v1
-; GFX11-NEXT:  v_mov_b32_e32 v9, v1
-; GFX11-NEXT:  v_mov_b32_e32 v10, v1
-; GFX11-NEXT:  v_mov_b32_e32 v11, v1
-; GFX11-NEXT:  v_mov_b32_e32 v12, v1
-; GFX11-NEXT:  v_mov_b32_e32 v13, v1
-; GFX11-NEXT:  v_mov_b32_e32 v14, v1
-; GFX11-NEXT:  v_mov_b32_e32 v15, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mov_b32_e32 v7, v1
+; GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GFX11-NEXT:    v_mov_b32_e32 v9, v1
+; GFX11-NEXT:    v_mov_b32_e32 v10, v1
+; GFX11-NEXT:    v_mov_b32_e32 v11, v1
+; GFX11-NEXT:    v_mov_b32_e32 v12, v1
+; GFX11-NEXT:    v_mov_b32_e32 v13, v1
+; GFX11-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-NEXT:    v_mov_b32_e32 v15, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x float> %val0, <16 x float> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1753,117 +1754,117 @@ entry:
 define <32 x float> @shuffle_v32f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  v_mov_b32_e32 v6, v1
-; GFX9-NEXT:  v_mov_b32_e32 v7, v1
-; GFX9-NEXT:  v_mov_b32_e32 v8, v1
-; GFX9-NEXT:  v_mov_b32_e32 v9, v1
-; GFX9-NEXT:  v_mov_b32_e32 v10, v1
-; GFX9-NEXT:  v_mov_b32_e32 v11, v1
-; GFX9-NEXT:  v_mov_b32_e32 v12, v1
-; GFX9-NEXT:  v_mov_b32_e32 v13, v1
-; GFX9-NEXT:  v_mov_b32_e32 v14, v1
-; GFX9-NEXT:  v_mov_b32_e32 v15, v1
-; GFX9-NEXT:  v_mov_b32_e32 v16, v1
-; GFX9-NEXT:  v_mov_b32_e32 v17, v1
-; GFX9-NEXT:  v_mov_b32_e32 v18, v1
-; GFX9-NEXT:  v_mov_b32_e32 v19, v1
-; GFX9-NEXT:  v_mov_b32_e32 v20, v1
-; GFX9-NEXT:  v_mov_b32_e32 v21, v1
-; GFX9-NEXT:  v_mov_b32_e32 v22, v1
-; GFX9-NEXT:  v_mov_b32_e32 v23, v1
-; GFX9-NEXT:  v_mov_b32_e32 v24, v1
-; GFX9-NEXT:  v_mov_b32_e32 v25, v1
-; GFX9-NEXT:  v_mov_b32_e32 v26, v1
-; GFX9-NEXT:  v_mov_b32_e32 v27, v1
-; GFX9-NEXT:  v_mov_b32_e32 v28, v1
-; GFX9-NEXT:  v_mov_b32_e32 v29, v1
-; GFX9-NEXT:  v_mov_b32_e32 v30, v1
-; GFX9-NEXT:  v_mov_b32_e32 v31, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v1
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v14, v1
+; GFX9-NEXT:    v_mov_b32_e32 v15, v1
+; GFX9-NEXT:    v_mov_b32_e32 v16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-NEXT:    v_mov_b32_e32 v18, v1
+; GFX9-NEXT:    v_mov_b32_e32 v19, v1
+; GFX9-NEXT:    v_mov_b32_e32 v20, v1
+; GFX9-NEXT:    v_mov_b32_e32 v21, v1
+; GFX9-NEXT:    v_mov_b32_e32 v22, v1
+; GFX9-NEXT:    v_mov_b32_e32 v23, v1
+; GFX9-NEXT:    v_mov_b32_e32 v24, v1
+; GFX9-NEXT:    v_mov_b32_e32 v25, v1
+; GFX9-NEXT:    v_mov_b32_e32 v26, v1
+; GFX9-NEXT:    v_mov_b32_e32 v27, v1
+; GFX9-NEXT:    v_mov_b32_e32 v28, v1
+; GFX9-NEXT:    v_mov_b32_e32 v29, v1
+; GFX9-NEXT:    v_mov_b32_e32 v30, v1
+; GFX9-NEXT:    v_mov_b32_e32 v31, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  v_mov_b32_e32 v6, v1
-; GFX10-NEXT:  v_mov_b32_e32 v7, v1
-; GFX10-NEXT:  v_mov_b32_e32 v8, v1
-; GFX10-NEXT:  v_mov_b32_e32 v9, v1
-; GFX10-NEXT:  v_mov_b32_e32 v10, v1
-; GFX10-NEXT:  v_mov_b32_e32 v11, v1
-; GFX10-NEXT:  v_mov_b32_e32 v12, v1
-; GFX10-NEXT:  v_mov_b32_e32 v13, v1
-; GFX10-NEXT:  v_mov_b32_e32 v14, v1
-; GFX10-NEXT:  v_mov_b32_e32 v15, v1
-; GFX10-NEXT:  v_mov_b32_e32 v16, v1
-; GFX10-NEXT:  v_mov_b32_e32 v17, v1
-; GFX10-NEXT:  v_mov_b32_e32 v18, v1
-; GFX10-NEXT:  v_mov_b32_e32 v19, v1
-; GFX10-NEXT:  v_mov_b32_e32 v20, v1
-; GFX10-NEXT:  v_mov_b32_e32 v21, v1
-; GFX10-NEXT:  v_mov_b32_e32 v22, v1
-; GFX10-NEXT:  v_mov_b32_e32 v23, v1
-; GFX10-NEXT:  v_mov_b32_e32 v24, v1
-; GFX10-NEXT:  v_mov_b32_e32 v25, v1
-; GFX10-NEXT:  v_mov_b32_e32 v26, v1
-; GFX10-NEXT:  v_mov_b32_e32 v27, v1
-; GFX10-NEXT:  v_mov_b32_e32 v28, v1
-; GFX10-NEXT:  v_mov_b32_e32 v29, v1
-; GFX10-NEXT:  v_mov_b32_e32 v30, v1
-; GFX10-NEXT:  v_mov_b32_e32 v31, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v1
+; GFX10-NEXT:    v_mov_b32_e32 v12, v1
+; GFX10-NEXT:    v_mov_b32_e32 v13, v1
+; GFX10-NEXT:    v_mov_b32_e32 v14, v1
+; GFX10-NEXT:    v_mov_b32_e32 v15, v1
+; GFX10-NEXT:    v_mov_b32_e32 v16, v1
+; GFX10-NEXT:    v_mov_b32_e32 v17, v1
+; GFX10-NEXT:    v_mov_b32_e32 v18, v1
+; GFX10-NEXT:    v_mov_b32_e32 v19, v1
+; GFX10-NEXT:    v_mov_b32_e32 v20, v1
+; GFX10-NEXT:    v_mov_b32_e32 v21, v1
+; GFX10-NEXT:    v_mov_b32_e32 v22, v1
+; GFX10-NEXT:    v_mov_b32_e32 v23, v1
+; GFX10-NEXT:    v_mov_b32_e32 v24, v1
+; GFX10-NEXT:    v_mov_b32_e32 v25, v1
+; GFX10-NEXT:    v_mov_b32_e32 v26, v1
+; GFX10-NEXT:    v_mov_b32_e32 v27, v1
+; GFX10-NEXT:    v_mov_b32_e32 v28, v1
+; GFX10-NEXT:    v_mov_b32_e32 v29, v1
+; GFX10-NEXT:    v_mov_b32_e32 v30, v1
+; GFX10-NEXT:    v_mov_b32_e32 v31, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  v_mov_b32_e32 v6, v1
-; GFX11-NEXT:  v_mov_b32_e32 v7, v1
-; GFX11-NEXT:  v_mov_b32_e32 v8, v1
-; GFX11-NEXT:  v_mov_b32_e32 v9, v1
-; GFX11-NEXT:  v_mov_b32_e32 v10, v1
-; GFX11-NEXT:  v_mov_b32_e32 v11, v1
-; GFX11-NEXT:  v_mov_b32_e32 v12, v1
-; GFX11-NEXT:  v_mov_b32_e32 v13, v1
-; GFX11-NEXT:  v_mov_b32_e32 v14, v1
-; GFX11-NEXT:  v_mov_b32_e32 v15, v1
-; GFX11-NEXT:  v_mov_b32_e32 v16, v1
-; GFX11-NEXT:  v_mov_b32_e32 v17, v1
-; GFX11-NEXT:  v_mov_b32_e32 v18, v1
-; GFX11-NEXT:  v_mov_b32_e32 v19, v1
-; GFX11-NEXT:  v_mov_b32_e32 v20, v1
-; GFX11-NEXT:  v_mov_b32_e32 v21, v1
-; GFX11-NEXT:  v_mov_b32_e32 v22, v1
-; GFX11-NEXT:  v_mov_b32_e32 v23, v1
-; GFX11-NEXT:  v_mov_b32_e32 v24, v1
-; GFX11-NEXT:  v_mov_b32_e32 v25, v1
-; GFX11-NEXT:  v_mov_b32_e32 v26, v1
-; GFX11-NEXT:  v_mov_b32_e32 v27, v1
-; GFX11-NEXT:  v_mov_b32_e32 v28, v1
-; GFX11-NEXT:  v_mov_b32_e32 v29, v1
-; GFX11-NEXT:  v_mov_b32_e32 v30, v1
-; GFX11-NEXT:  v_mov_b32_e32 v31, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mov_b32_e32 v7, v1
+; GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GFX11-NEXT:    v_mov_b32_e32 v9, v1
+; GFX11-NEXT:    v_mov_b32_e32 v10, v1
+; GFX11-NEXT:    v_mov_b32_e32 v11, v1
+; GFX11-NEXT:    v_mov_b32_e32 v12, v1
+; GFX11-NEXT:    v_mov_b32_e32 v13, v1
+; GFX11-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-NEXT:    v_mov_b32_e32 v15, v1
+; GFX11-NEXT:    v_mov_b32_e32 v16, v1
+; GFX11-NEXT:    v_mov_b32_e32 v17, v1
+; GFX11-NEXT:    v_mov_b32_e32 v18, v1
+; GFX11-NEXT:    v_mov_b32_e32 v19, v1
+; GFX11-NEXT:    v_mov_b32_e32 v20, v1
+; GFX11-NEXT:    v_mov_b32_e32 v21, v1
+; GFX11-NEXT:    v_mov_b32_e32 v22, v1
+; GFX11-NEXT:    v_mov_b32_e32 v23, v1
+; GFX11-NEXT:    v_mov_b32_e32 v24, v1
+; GFX11-NEXT:    v_mov_b32_e32 v25, v1
+; GFX11-NEXT:    v_mov_b32_e32 v26, v1
+; GFX11-NEXT:    v_mov_b32_e32 v27, v1
+; GFX11-NEXT:    v_mov_b32_e32 v28, v1
+; GFX11-NEXT:    v_mov_b32_e32 v29, v1
+; GFX11-NEXT:    v_mov_b32_e32 v30, v1
+; GFX11-NEXT:    v_mov_b32_e32 v31, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x float> %val0, <32 x float> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index b85bd4c63466840..a00ee9ac0d5a9d8 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -32,44 +32,33 @@ define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GX900-LABEL: shuffle_v4f16_234u:
-; GX900:       ; %bb.0:
-; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GX900-NEXT:    s_waitcnt vmcnt(1)
-; GX900-NEXT:    v_mov_b32_e32 v0, v6
-; GX900-NEXT:    s_waitcnt vmcnt(0)
-; GX900-NEXT:    v_mov_b32_e32 v1, v4
-; GX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: shuffle_v4f16_234u:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
-; GFX940-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v1, v6
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: shuffle_v4f16_234u:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_234u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v6
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_234u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
@@ -320,47 +309,43 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GX900-LABEL: shuffle_v4f16_357u:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x7060302
-; GX900-NEXT:    s_waitcnt vmcnt(1)
-; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
 ; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4f16_357u:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
-; GFX940-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_357u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_357u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1018,34 +1003,31 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX9-LABEL: shuffle_v4f16_3456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_3456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1057,12 +1039,11 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX9-LABEL: shuffle_v4f16_5634:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5634:
@@ -1233,7 +1214,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GX900-LABEL: shuffle_v4f16_0000:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GX900-NEXT:    global_load_dword v0, v[0:1], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
 ; GX900-NEXT:    v_perm_b32 v0, v0, v0, s4
@@ -1243,7 +1224,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX940-LABEL: shuffle_v4f16_0000:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX940-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s0
@@ -1253,7 +1234,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-LABEL: shuffle_v4f16_0000:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -1262,7 +1243,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX11-LABEL: shuffle_v4f16_0000:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1905,43 +1886,39 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GX900-LABEL: shuffle_v4f16_0456:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v6, v[0:1], off
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x5040100
-; GX900-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GX900-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
-; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
-; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4f16_0456:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v6, v[0:1], off
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_perm_b32 v0, v6, v4, s0
-; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_0456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
-; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x5040100
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_0456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
@@ -3177,44 +3154,33 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1
 }
 
 define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GX900-LABEL: shuffle_v4bf16_234u:
-; GX900:       ; %bb.0:
-; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GX900-NEXT:    s_waitcnt vmcnt(1)
-; GX900-NEXT:    v_mov_b32_e32 v0, v6
-; GX900-NEXT:    s_waitcnt vmcnt(0)
-; GX900-NEXT:    v_mov_b32_e32 v1, v4
-; GX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: shuffle_v4bf16_234u:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
-; GFX940-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v1, v6
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: shuffle_v4bf16_234u:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_234u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v6
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_234u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3465,47 +3431,43 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GX900-LABEL: shuffle_v4bf16_357u:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x7060302
-; GX900-NEXT:    s_waitcnt vmcnt(1)
-; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
 ; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4bf16_357u:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
-; GFX940-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_357u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_357u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4163,34 +4125,31 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_3456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_3456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_3456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4202,12 +4161,11 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_5634:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_5634:
@@ -4293,7 +4251,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GX900-LABEL: shuffle_v4bf16_0000:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GX900-NEXT:    global_load_dword v0, v[0:1], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
 ; GX900-NEXT:    v_perm_b32 v0, v0, v0, s4
@@ -4303,7 +4261,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX940-LABEL: shuffle_v4bf16_0000:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX940-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s0
@@ -4313,7 +4271,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-LABEL: shuffle_v4bf16_0000:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -4322,7 +4280,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-LABEL: shuffle_v4bf16_0000:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -5235,43 +5193,39 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GX900-LABEL: shuffle_v4bf16_0456:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v6, v[0:1], off
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x5040100
-; GX900-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GX900-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
-; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
-; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4bf16_0456:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v6, v[0:1], off
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_perm_b32 v0, v6, v4, s0
-; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_0456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
-; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x5040100
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_0456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
index 8186f6c9b42fba8..695b0a796eb432b 100644
--- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
+++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
@@ -8,9 +8,10 @@
 define i32 @foo(ptr %descs, i32 %num, i32 %cw) local_unnamed_addr #0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr d16, [r0, #32]
+; CHECK-NEXT:    add r0, r0, #32
+; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
 ; CHECK-NEXT:    vadd.i32 d16, d16, d16
-; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    vmov.32 r0, d16[1]
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.vec = load <16 x i32>, ptr %descs, align 4
diff --git a/llvm/test/CodeGen/ARM/vector-promotion.ll b/llvm/test/CodeGen/ARM/vector-promotion.ll
index 344014ad8044953..c3889ccfec7dbdb 100644
--- a/llvm/test/CodeGen/ARM/vector-promotion.ll
+++ b/llvm/test/CodeGen/ARM/vector-promotion.ll
@@ -44,7 +44,7 @@ define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest)
 
 
 ; IR-BOTH-LABEL: @unsupportedChainInDifferentBBs
-; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load volatile <2 x i32>, ptr %addr1
 ; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
 ; IR-BOTH-NEXT: br i1 %bool, label %bb2, label %end
 ; BB2
@@ -58,10 +58,10 @@ define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest)
 ; ASM: bx
 define void @unsupportedChainInDifferentBBs(ptr %addr1, ptr %dest, i1 %bool) {
 bb1:
-  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %in1 = load volatile <2 x i32>, ptr %addr1, align 8
   %extract = extractelement <2 x i32> %in1, i32 0
   br i1 %bool, label %bb2, label %end
-bb2: 
+bb2:
   %out = or i32 %extract, 1
   store i32 %out, ptr %dest, align 4
   br label %end
@@ -150,7 +150,7 @@ define void @udivCase(ptr %addr1, ptr %dest) {
 ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
 ;
 ; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest
-; IR-BOTH-NEXT: ret 
+; IR-BOTH-NEXT: ret
 define void @uremCase(ptr %addr1, ptr %dest) {
   %in1 = load <2 x i32>, ptr %addr1, align 8
   %extract = extractelement <2 x i32> %in1, i32 1
@@ -169,7 +169,7 @@ define void @uremCase(ptr %addr1, ptr %dest) {
 ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
 ;
 ; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest
-; IR-BOTH-NEXT: ret 
+; IR-BOTH-NEXT: ret
 define void @sdivCase(ptr %addr1, ptr %dest) {
   %in1 = load <2 x i32>, ptr %addr1, align 8
   %extract = extractelement <2 x i32> %in1, i32 1
@@ -188,7 +188,7 @@ define void @sdivCase(ptr %addr1, ptr %dest) {
 ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
 ;
 ; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest
-; IR-BOTH-NEXT: ret 
+; IR-BOTH-NEXT: ret
 define void @sremCase(ptr %addr1, ptr %dest) {
   %in1 = load <2 x i32>, ptr %addr1, align 8
   %extract = extractelement <2 x i32> %in1, i32 1
@@ -199,7 +199,7 @@ define void @sremCase(ptr %addr1, ptr %dest) {
 
 ; IR-BOTH-LABEL: @fdivCase
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version:  
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0
 ; Vector version:
@@ -209,7 +209,7 @@ define void @sremCase(ptr %addr1, ptr %dest) {
 ; IR-BOTH-NEXT: store float [[RES]], ptr %dest
 ; IR-BOTH-NEXT: ret
 define void @fdivCase(ptr %addr1, ptr %dest) {
-  %in1 = load <2 x float>, ptr %addr1, align 8   
+  %in1 = load <2 x float>, ptr %addr1, align 8
   %extract = extractelement <2 x float> %in1, i32 1
   %out = fdiv float %extract, 7.0
   store float %out, ptr %dest, align 4
@@ -218,7 +218,7 @@ define void @fdivCase(ptr %addr1, ptr %dest) {
 
 ; IR-BOTH-LABEL: @fremCase
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version:  
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0
 ; Vector version:
@@ -228,7 +228,7 @@ define void @fdivCase(ptr %addr1, ptr %dest) {
 ; IR-BOTH-NEXT: store float [[RES]], ptr %dest
 ; IR-BOTH-NEXT: ret
 define void @fremCase(ptr %addr1, ptr %dest) {
-  %in1 = load <2 x float>, ptr %addr1, align 8   
+  %in1 = load <2 x float>, ptr %addr1, align 8
   %extract = extractelement <2 x float> %in1, i32 1
   %out = frem float %extract, 7.0
   store float %out, ptr %dest, align 4
@@ -272,7 +272,7 @@ define void @undefRemCase(ptr %addr1, ptr %dest) {
 ; flag is set.
 ; IR-BOTH-LABEL: @undefConstantFRemCaseWithFastMath
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version:  
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0
 ; Vector version:
@@ -282,7 +282,7 @@ define void @undefRemCase(ptr %addr1, ptr %dest) {
 ; IR-BOTH-NEXT: store float [[RES]], ptr %dest
 ; IR-BOTH-NEXT: ret
 define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
-  %in1 = load <2 x float>, ptr %addr1, align 8   
+  %in1 = load <2 x float>, ptr %addr1, align 8
   %extract = extractelement <2 x float> %in1, i32 1
   %out = frem nnan float %extract, 7.0
   store float %out, ptr %dest, align 4
@@ -293,7 +293,7 @@ define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
 ; flag is set.
 ; IR-BOTH-LABEL: @undefVectorFRemCaseWithFastMath
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version:  
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]]
 ; Vector version:
@@ -303,7 +303,7 @@ define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
 ; IR-BOTH-NEXT: store float [[RES]], ptr %dest
 ; IR-BOTH-NEXT: ret
 define void @undefVectorFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
-  %in1 = load <2 x float>, ptr %addr1, align 8   
+  %in1 = load <2 x float>, ptr %addr1, align 8
   %extract = extractelement <2 x float> %in1, i32 1
   %out = frem nnan float 7.0, %extract
   store float %out, ptr %dest, align 4
@@ -315,7 +315,7 @@ define void @undefVectorFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
 ; not promote on armv7.
 ; IR-BOTH-LABEL: @simpleOneInstructionPromotionFloat
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version: 
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0
 ; Vector version:
diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll
index 7ddf1d02834c381..06fc0cf4303f48c 100644
--- a/llvm/test/CodeGen/ARM/vext.ll
+++ b/llvm/test/CodeGen/ARM/vext.ll
@@ -76,9 +76,10 @@ define <4 x i16> @test_vextd16(ptr %A, ptr %B) nounwind {
 define <4 x i32> @test_vextq32(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: test_vextq32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT:    vext.32 q8, q9, q8, #3
+; CHECK-NEXT:    add r0, r0, #12
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
+; CHECK-NEXT:    vld1.32 {d17[1]}, [r0:32]
+; CHECK-NEXT:    vext.32 q8, q8, q9, #3
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
diff --git a/llvm/test/CodeGen/ARM/vuzp.ll b/llvm/test/CodeGen/ARM/vuzp.ll
index d54446a431ee989..c412189633a9bdb 100644
--- a/llvm/test/CodeGen/ARM/vuzp.ll
+++ b/llvm/test/CodeGen/ARM/vuzp.ll
@@ -285,13 +285,13 @@ entry:
 define <4 x i32> @vuzp_lower_shufflemask_zeroed(ptr %A, ptr %B) {
 ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr d18, [r0]
-; CHECK-NEXT:    vorr d19, d18, d18
-; CHECK-NEXT:    vldr d17, [r1]
-; CHECK-NEXT:    vtrn.32 d19, d17
-; CHECK-NEXT:    vdup.32 d16, d18[0]
-; CHECK-NEXT:    vmov r2, r3, d17
-; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    add r0, r1, #4
+; CHECK-NEXT:    vld1.32 {d19[1]}, [r0:32]
+; CHECK-NEXT:    vdup.32 d18, d16[0]
+; CHECK-NEXT:    vtrn.32 d16, d19
+; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vmov r2, r3, d19
 ; CHECK-NEXT:    mov pc, lr
 entry:
   %tmp1 = load <2 x i32>, ptr %A
diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index 383e5ef19cebf1d..09e2cb89be618a0 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -5513,28 +5513,28 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
 ; MIPS32R5-NEXT:    jr $ra
 ; MIPS32R5-NEXT:    nop
 ;
-; MIPS64R5-LABEL: mixed_i8:
-; MIPS64R5:       # %bb.0: # %entry
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -48
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 48
-; MIPS64R5-NEXT:    sll $1, $5, 0
-; MIPS64R5-NEXT:    andi $1, $1, 255
-; MIPS64R5-NEXT:    mtc1 $1, $f0
-; MIPS64R5-NEXT:    cvt.s.w $f0, $f0
-; MIPS64R5-NEXT:    swc1 $f0, 36($sp)
-; MIPS64R5-NEXT:    swc1 $f0, 32($sp)
-; MIPS64R5-NEXT:    sd $4, 0($sp)
-; MIPS64R5-NEXT:    ld.w $w0, 0($sp)
-; MIPS64R5-NEXT:    ld.w $w1, 32($sp)
-; MIPS64R5-NEXT:    fadd.w $w0, $w1, $w0
-; MIPS64R5-NEXT:    sd $6, 16($sp)
-; MIPS64R5-NEXT:    ld.w $w1, 16($sp)
-; MIPS64R5-NEXT:    fadd.w $w0, $w0, $w1
-; MIPS64R5-NEXT:    splati.w $w1, $w0[1]
-; MIPS64R5-NEXT:    add.s $f0, $f0, $f1
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 48
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: mixed_i8:
+; MIPS64R5EB:       # %bb.0: # %entry
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -48
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 48
+; MIPS64R5EB-NEXT:    sll $1, $5, 0
+; MIPS64R5EB-NEXT:    andi $1, $1, 255
+; MIPS64R5EB-NEXT:    mtc1 $1, $f0
+; MIPS64R5EB-NEXT:    cvt.s.w $f0, $f0
+; MIPS64R5EB-NEXT:    swc1 $f0, 36($sp)
+; MIPS64R5EB-NEXT:    swc1 $f0, 32($sp)
+; MIPS64R5EB-NEXT:    insert.d $w0[0], $4
+; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
+; MIPS64R5EB-NEXT:    ld.w $w1, 32($sp)
+; MIPS64R5EB-NEXT:    fadd.w $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    insert.d $w1[0], $6
+; MIPS64R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS64R5EB-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    splati.w $w1, $w0[1]
+; MIPS64R5EB-NEXT:    add.s $f0, $f0, $f1
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 48
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
 ;
 ; MIPS64EL-LABEL: mixed_i8:
 ; MIPS64EL:       # %bb.0: # %entry
@@ -5559,6 +5559,27 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
 ; MIPS64EL-NEXT:    add.s $f0, $f1, $f0
 ; MIPS64EL-NEXT:    jr $ra
 ; MIPS64EL-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: mixed_i8:
+; MIPS64R5EL:       # %bb.0: # %entry
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -48
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 48
+; MIPS64R5EL-NEXT:    sll $1, $5, 0
+; MIPS64R5EL-NEXT:    andi $1, $1, 255
+; MIPS64R5EL-NEXT:    mtc1 $1, $f0
+; MIPS64R5EL-NEXT:    cvt.s.w $f0, $f0
+; MIPS64R5EL-NEXT:    swc1 $f0, 36($sp)
+; MIPS64R5EL-NEXT:    swc1 $f0, 32($sp)
+; MIPS64R5EL-NEXT:    insert.d $w0[0], $4
+; MIPS64R5EL-NEXT:    ld.w $w1, 32($sp)
+; MIPS64R5EL-NEXT:    fadd.w $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    insert.d $w1[0], $6
+; MIPS64R5EL-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    splati.w $w1, $w0[1]
+; MIPS64R5EL-NEXT:    add.s $f0, $f0, $f1
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 48
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
 entry:
   %0 = zext i8 %b to i32
   %1 = uitofp i32 %0 to float
diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations.ll b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
index 4fc3f57aa002dfc..7c8c31c0ec181f9 100644
--- a/llvm/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
@@ -1014,34 +1014,54 @@ define i32 @extract_sext_v4i32() nounwind {
 ; O32:       # %bb.0:
 ; O32-NEXT:    lui $2, %hi(_gp_disp)
 ; O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; O32-NEXT:    addiu $sp, $sp, -32
+; O32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; O32-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
+; O32-NEXT:    move $fp, $sp
+; O32-NEXT:    addiu $1, $zero, -16
+; O32-NEXT:    and $sp, $sp, $1
 ; O32-NEXT:    addu $1, $2, $25
 ; O32-NEXT:    lw $1, %got(v4i32)($1)
-; O32-NEXT:    ld.w $w0, 0($1)
+; O32-NEXT:    lw $1, 4($1)
+; O32-NEXT:    sw $1, 4($sp)
+; O32-NEXT:    ld.w $w0, 0($sp)
 ; O32-NEXT:    addv.w $w0, $w0, $w0
-; O32-NEXT:    jr $ra
 ; O32-NEXT:    copy_s.w $2, $w0[1]
+; O32-NEXT:    move $sp, $fp
+; O32-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
+; O32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; O32-NEXT:    jr $ra
+; O32-NEXT:    addiu $sp, $sp, 32
 ;
 ; N32-LABEL: extract_sext_v4i32:
 ; N32:       # %bb.0:
+; N32-NEXT:    addiu $sp, $sp, -16
 ; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_sext_v4i32)))
 ; N32-NEXT:    addu $1, $1, $25
 ; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v4i32)))
 ; N32-NEXT:    lw $1, %got_disp(v4i32)($1)
-; N32-NEXT:    ld.w $w0, 0($1)
+; N32-NEXT:    lw $1, 4($1)
+; N32-NEXT:    sw $1, 4($sp)
+; N32-NEXT:    ld.w $w0, 0($sp)
 ; N32-NEXT:    addv.w $w0, $w0, $w0
-; N32-NEXT:    jr $ra
 ; N32-NEXT:    copy_s.w $2, $w0[1]
+; N32-NEXT:    jr $ra
+; N32-NEXT:    addiu $sp, $sp, 16
 ;
 ; N64-LABEL: extract_sext_v4i32:
 ; N64:       # %bb.0:
+; N64-NEXT:    daddiu $sp, $sp, -16
 ; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_sext_v4i32)))
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v4i32)))
 ; N64-NEXT:    ld $1, %got_disp(v4i32)($1)
-; N64-NEXT:    ld.w $w0, 0($1)
+; N64-NEXT:    lw $1, 4($1)
+; N64-NEXT:    sw $1, 4($sp)
+; N64-NEXT:    ld.w $w0, 0($sp)
 ; N64-NEXT:    addv.w $w0, $w0, $w0
-; N64-NEXT:    jr $ra
 ; N64-NEXT:    copy_s.w $2, $w0[1]
+; N64-NEXT:    jr $ra
+; N64-NEXT:    daddiu $sp, $sp, 16
   %1 = load <4 x i32>, ptr @v4i32
   %2 = add <4 x i32> %1, %1
   %3 = extractelement <4 x i32> %2, i32 1
@@ -1076,25 +1096,33 @@ define i64 @extract_sext_v2i64() nounwind {
 ;
 ; N32-LABEL: extract_sext_v2i64:
 ; N32:       # %bb.0:
+; N32-NEXT:    addiu $sp, $sp, -16
 ; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_sext_v2i64)))
 ; N32-NEXT:    addu $1, $1, $25
 ; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v2i64)))
 ; N32-NEXT:    lw $1, %got_disp(v2i64)($1)
-; N32-NEXT:    ld.d $w0, 0($1)
+; N32-NEXT:    ld $1, 8($1)
+; N32-NEXT:    sd $1, 8($sp)
+; N32-NEXT:    ld.d $w0, 0($sp)
 ; N32-NEXT:    addv.d $w0, $w0, $w0
-; N32-NEXT:    jr $ra
 ; N32-NEXT:    copy_s.d $2, $w0[1]
+; N32-NEXT:    jr $ra
+; N32-NEXT:    addiu $sp, $sp, 16
 ;
 ; N64-LABEL: extract_sext_v2i64:
 ; N64:       # %bb.0:
+; N64-NEXT:    daddiu $sp, $sp, -16
 ; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_sext_v2i64)))
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v2i64)))
 ; N64-NEXT:    ld $1, %got_disp(v2i64)($1)
-; N64-NEXT:    ld.d $w0, 0($1)
+; N64-NEXT:    ld $1, 8($1)
+; N64-NEXT:    sd $1, 8($sp)
+; N64-NEXT:    ld.d $w0, 0($sp)
 ; N64-NEXT:    addv.d $w0, $w0, $w0
-; N64-NEXT:    jr $ra
 ; N64-NEXT:    copy_s.d $2, $w0[1]
+; N64-NEXT:    jr $ra
+; N64-NEXT:    daddiu $sp, $sp, 16
   %1 = load <2 x i64>, ptr @v2i64
   %2 = add <2 x i64> %1, %1
   %3 = extractelement <2 x i64> %2, i32 1
@@ -1186,34 +1214,54 @@ define i32 @extract_zext_v4i32() nounwind {
 ; O32:       # %bb.0:
 ; O32-NEXT:    lui $2, %hi(_gp_disp)
 ; O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; O32-NEXT:    addiu $sp, $sp, -32
+; O32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; O32-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
+; O32-NEXT:    move $fp, $sp
+; O32-NEXT:    addiu $1, $zero, -16
+; O32-NEXT:    and $sp, $sp, $1
 ; O32-NEXT:    addu $1, $2, $25
 ; O32-NEXT:    lw $1, %got(v4i32)($1)
-; O32-NEXT:    ld.w $w0, 0($1)
+; O32-NEXT:    lw $1, 4($1)
+; O32-NEXT:    sw $1, 4($sp)
+; O32-NEXT:    ld.w $w0, 0($sp)
 ; O32-NEXT:    addv.w $w0, $w0, $w0
-; O32-NEXT:    jr $ra
 ; O32-NEXT:    copy_s.w $2, $w0[1]
+; O32-NEXT:    move $sp, $fp
+; O32-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
+; O32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; O32-NEXT:    jr $ra
+; O32-NEXT:    addiu $sp, $sp, 32
 ;
 ; N32-LABEL: extract_zext_v4i32:
 ; N32:       # %bb.0:
+; N32-NEXT:    addiu $sp, $sp, -16
 ; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_zext_v4i32)))
 ; N32-NEXT:    addu $1, $1, $25
 ; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v4i32)))
 ; N32-NEXT:    lw $1, %got_disp(v4i32)($1)
-; N32-NEXT:    ld.w $w0, 0($1)
+; N32-NEXT:    lw $1, 4($1)
+; N32-NEXT:    sw $1, 4($sp)
+; N32-NEXT:    ld.w $w0, 0($sp)
 ; N32-NEXT:    addv.w $w0, $w0, $w0
-; N32-NEXT:    jr $ra
 ; N32-NEXT:    copy_s.w $2, $w0[1]
+; N32-NEXT:    jr $ra
+; N32-NEXT:    addiu $sp, $sp, 16
 ;
 ; N64-LABEL: extract_zext_v4i32:
 ; N64:       # %bb.0:
+; N64-NEXT:    daddiu $sp, $sp, -16
 ; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_zext_v4i32)))
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v4i32)))
 ; N64-NEXT:    ld $1, %got_disp(v4i32)($1)
-; N64-NEXT:    ld.w $w0, 0($1)
+; N64-NEXT:    lw $1, 4($1)
+; N64-NEXT:    sw $1, 4($sp)
+; N64-NEXT:    ld.w $w0, 0($sp)
 ; N64-NEXT:    addv.w $w0, $w0, $w0
-; N64-NEXT:    jr $ra
 ; N64-NEXT:    copy_s.w $2, $w0[1]
+; N64-NEXT:    jr $ra
+; N64-NEXT:    daddiu $sp, $sp, 16
   %1 = load <4 x i32>, ptr @v4i32
   %2 = add <4 x i32> %1, %1
   %3 = extractelement <4 x i32> %2, i32 1
@@ -1248,25 +1296,33 @@ define i64 @extract_zext_v2i64() nounwind {
 ;
 ; N32-LABEL: extract_zext_v2i64:
 ; N32:       # %bb.0:
+; N32-NEXT:    addiu $sp, $sp, -16
 ; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_zext_v2i64)))
 ; N32-NEXT:    addu $1, $1, $25
 ; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v2i64)))
 ; N32-NEXT:    lw $1, %got_disp(v2i64)($1)
-; N32-NEXT:    ld.d $w0, 0($1)
+; N32-NEXT:    ld $1, 8($1)
+; N32-NEXT:    sd $1, 8($sp)
+; N32-NEXT:    ld.d $w0, 0($sp)
 ; N32-NEXT:    addv.d $w0, $w0, $w0
-; N32-NEXT:    jr $ra
 ; N32-NEXT:    copy_s.d $2, $w0[1]
+; N32-NEXT:    jr $ra
+; N32-NEXT:    addiu $sp, $sp, 16
 ;
 ; N64-LABEL: extract_zext_v2i64:
 ; N64:       # %bb.0:
+; N64-NEXT:    daddiu $sp, $sp, -16
 ; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_zext_v2i64)))
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v2i64)))
 ; N64-NEXT:    ld $1, %got_disp(v2i64)($1)
-; N64-NEXT:    ld.d $w0, 0($1)
+; N64-NEXT:    ld $1, 8($1)
+; N64-NEXT:    sd $1, 8($sp)
+; N64-NEXT:    ld.d $w0, 0($sp)
 ; N64-NEXT:    addv.d $w0, $w0, $w0
-; N64-NEXT:    jr $ra
 ; N64-NEXT:    copy_s.d $2, $w0[1]
+; N64-NEXT:    jr $ra
+; N64-NEXT:    daddiu $sp, $sp, 16
   %1 = load <2 x i64>, ptr @v2i64
   %2 = add <2 x i64> %1, %1
   %3 = extractelement <2 x i64> %2, i32 1
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index ca1b5fdabbf8ffc..3a82a8abd20c662 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -298,13 +298,13 @@ define i128 @srem_i128_pow2k(i128 %lhs) {
 define i128 @urem_i128_pow2k(i128 %lhs) {
 ; CHECK-LABEL: urem_i128_pow2k(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [urem_i128_pow2k_param_0];
-; CHECK-NEXT:    and.b64 %rd3, %rd1, 8589934591;
-; CHECK-NEXT:    mov.b64 %rd4, 0;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd3, %rd4};
+; CHECK-NEXT:    ld.param.u64 %rd1, [urem_i128_pow2k_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 8589934591;
+; CHECK-NEXT:    mov.b64 %rd3, 0;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd3};
 ; CHECK-NEXT:    ret;
   %div = urem i128 %lhs, 8589934592
   ret i128 %div
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index c2f166770a7ad95..e1079814a8e7a95 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -268,19 +268,19 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    bfe.s32 %r9, %r2, 24, 8;
 ; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
 ; CHECK-NEXT:    setp.gt.s32 %p4, %r10, %r9;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r12, %r1, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r13, %r1, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r15, %r2, 24, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT:    bfe.u32 %r17, %r2, 16, 8;
-; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
+; CHECK-NEXT:    bfe.u32 %r11, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
+; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p4;
+; CHECK-NEXT:    bfe.u32 %r17, %r1, 16, 8;
+; CHECK-NEXT:    selp.b32 %r18, %r17, %r13, %p3;
 ; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r20, %r2, 8, 8;
-; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT:    bfe.u32 %r22, %r2, 0, 8;
-; CHECK-NEXT:    selp.b32 %r23, %r11, %r22, %p1;
+; CHECK-NEXT:    bfe.u32 %r20, %r1, 8, 8;
+; CHECK-NEXT:    selp.b32 %r21, %r20, %r12, %p2;
+; CHECK-NEXT:    bfe.u32 %r22, %r1, 0, 8;
+; CHECK-NEXT:    selp.b32 %r23, %r22, %r11, %p1;
 ; CHECK-NEXT:    prmt.b32 %r24, %r23, %r21, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r19, 0x5410U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
@@ -346,19 +346,19 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    bfe.s32 %r9, %r2, 24, 8;
 ; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
 ; CHECK-NEXT:    setp.le.s32 %p4, %r10, %r9;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r12, %r1, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r13, %r1, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r15, %r2, 24, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT:    bfe.u32 %r17, %r2, 16, 8;
-; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
+; CHECK-NEXT:    bfe.u32 %r11, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
+; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p4;
+; CHECK-NEXT:    bfe.u32 %r17, %r1, 16, 8;
+; CHECK-NEXT:    selp.b32 %r18, %r17, %r13, %p3;
 ; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r20, %r2, 8, 8;
-; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT:    bfe.u32 %r22, %r2, 0, 8;
-; CHECK-NEXT:    selp.b32 %r23, %r11, %r22, %p1;
+; CHECK-NEXT:    bfe.u32 %r20, %r1, 8, 8;
+; CHECK-NEXT:    selp.b32 %r21, %r20, %r12, %p2;
+; CHECK-NEXT:    bfe.u32 %r22, %r1, 0, 8;
+; CHECK-NEXT:    selp.b32 %r23, %r22, %r11, %p1;
 ; CHECK-NEXT:    prmt.b32 %r24, %r23, %r21, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r19, 0x5410U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
index 80c26471d8cdb0a..8eaa8d7713bcd43 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
@@ -15,9 +15,9 @@ define i32 @vec_struct_test(i32 %i, ptr nocapture readonly byval(%struct.vec_str
   ; 32BIT: bb.0.entry:
   ; 32BIT-NEXT:   liveins: $r3, $r5, $r6, $r7, $r8
   ; 32BIT-NEXT: {{  $}}
-  ; 32BIT-NEXT:   STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
-  ; 32BIT-NEXT:   STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
   ; 32BIT-NEXT:   STW renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+  ; 32BIT-NEXT:   STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
   ; 32BIT-NEXT:   STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12)
   ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r5, killed renamable $r3
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
index 7f6fdc7f88cd113..b5607e3d91e1055 100644
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -1105,16 +1105,11 @@ define <2 x i64> @testSplati64_1(ptr nocapture readonly %ptr) #0 {
 ;
 ; CHECK-NOVSX-LABEL: testSplati64_1:
 ; CHECK-NOVSX:       # %bb.0: # %entry
-; CHECK-NOVSX-NEXT:    ld r4, 8(r3)
-; CHECK-NOVSX-NEXT:    std r4, -8(r1)
-; CHECK-NOVSX-NEXT:    ld r3, 0(r3)
+; CHECK-NOVSX-NEXT:    ld r3, 8(r3)
+; CHECK-NOVSX-NEXT:    std r3, -8(r1)
 ; CHECK-NOVSX-NEXT:    std r3, -16(r1)
 ; CHECK-NOVSX-NEXT:    addi r3, r1, -16
 ; CHECK-NOVSX-NEXT:    lvx v2, 0, r3
-; CHECK-NOVSX-NEXT:    addis r3, r2, .LCPI21_0@toc@ha
-; CHECK-NOVSX-NEXT:    addi r3, r3, .LCPI21_0@toc@l
-; CHECK-NOVSX-NEXT:    lvx v3, 0, r3
-; CHECK-NOVSX-NEXT:    vperm v2, v2, v2, v3
 ; CHECK-NOVSX-NEXT:    blr
 ;
 ; CHECK-P7-LABEL: testSplati64_1:
diff --git a/llvm/test/CodeGen/PowerPC/const-stov.ll b/llvm/test/CodeGen/PowerPC/const-stov.ll
index 69c68a4f27371ef..c32c1ff1fc06baf 100644
--- a/llvm/test/CodeGen/PowerPC/const-stov.ll
+++ b/llvm/test/CodeGen/PowerPC/const-stov.ll
@@ -132,28 +132,29 @@ entry:
 define  <2 x i64> @i64(ptr nocapture noundef readonly %p) {
 ; PWR7-BE-LABEL: i64:
 ; PWR7-BE:       # %bb.0: # %entry
-; PWR7-BE-NEXT:    lxvd2x v2, 0, r3
+; PWR7-BE-NEXT:    lfd f0, 0(r3)
 ; PWR7-BE-NEXT:    li r3, 10
+; PWR7-BE-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; PWR7-BE-NEXT:    std r3, -16(r1)
-; PWR7-BE-NEXT:    std r3, -8(r1)
-; PWR7-BE-NEXT:    addi r3, r1, -16
-; PWR7-BE-NEXT:    lxvd2x v3, 0, r3
+; PWR7-BE-NEXT:    lfd f0, -16(r1)
+; PWR7-BE-NEXT:    xxpermdi v3, vs0, vs0, 1
 ; PWR7-BE-NEXT:    xxmrghd v2, v2, v3
 ; PWR7-BE-NEXT:    blr
 ;
 ; PWR8-BE-LABEL: i64:
 ; PWR8-BE:       # %bb.0: # %entry
-; PWR8-BE-NEXT:    lxvd2x v2, 0, r3
+; PWR8-BE-NEXT:    lfd f0, 0(r3)
 ; PWR8-BE-NEXT:    li r3, 10
+; PWR8-BE-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; PWR8-BE-NEXT:    mtfprd f0, r3
 ; PWR8-BE-NEXT:    xxmrghd v2, v2, vs0
 ; PWR8-BE-NEXT:    blr
 ;
 ; PWR8-LE-LABEL: i64:
 ; PWR8-LE:       # %bb.0: # %entry
-; PWR8-LE-NEXT:    lxvd2x vs0, 0, r3
+; PWR8-LE-NEXT:    lfd f0, 0(r3)
 ; PWR8-LE-NEXT:    li r3, 10
-; PWR8-LE-NEXT:    xxswapd v2, vs0
+; PWR8-LE-NEXT:    xxspltd v2, vs0, 0
 ; PWR8-LE-NEXT:    mtfprd f0, r3
 ; PWR8-LE-NEXT:    xxpermdi v2, vs0, v2, 1
 ; PWR8-LE-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/pr27078.ll b/llvm/test/CodeGen/PowerPC/pr27078.ll
index ee4d4ff9c6c790a..beb63ce0127bcff 100644
--- a/llvm/test/CodeGen/PowerPC/pr27078.ll
+++ b/llvm/test/CodeGen/PowerPC/pr27078.ll
@@ -4,19 +4,21 @@
 define <4 x float> @bar(ptr %p, ptr %q) {
 ; CHECK-LABEL: bar:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li 5, 16
-; CHECK-NEXT:    lxvw4x 1, 0, 3
-; CHECK-NEXT:    lxvw4x 3, 0, 4
-; CHECK-NEXT:    xvsubsp 35, 3, 1
-; CHECK-NEXT:    lxvw4x 0, 3, 5
-; CHECK-NEXT:    lxvw4x 2, 4, 5
+; CHECK-NEXT:    li 5, 24
+; CHECK-NEXT:    lxvw4x 1, 0, 4
+; CHECK-NEXT:    lfiwzx 0, 3, 5
+; CHECK-NEXT:    xxmrghw 34, 0, 0
+; CHECK-NEXT:    lfiwzx 0, 4, 5
 ; CHECK-NEXT:    addis 5, 2, .LCPI0_0@toc@ha
 ; CHECK-NEXT:    addi 5, 5, .LCPI0_0@toc@l
 ; CHECK-NEXT:    lxvw4x 36, 0, 5
-; CHECK-NEXT:    li 5, 32
-; CHECK-NEXT:    xvsubsp 34, 2, 0
-; CHECK-NEXT:    lxvw4x 0, 3, 5
-; CHECK-NEXT:    lxvw4x 1, 4, 5
+; CHECK-NEXT:    li 5, 36
+; CHECK-NEXT:    xxmrghw 35, 0, 0
+; CHECK-NEXT:    lxvw4x 0, 0, 3
+; CHECK-NEXT:    xvsubsp 34, 35, 34
+; CHECK-NEXT:    xvsubsp 35, 1, 0
+; CHECK-NEXT:    lfiwzx 0, 3, 5
+; CHECK-NEXT:    lfiwzx 1, 4, 5
 ; CHECK-NEXT:    addis 3, 2, .LCPI0_1@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .LCPI0_1@toc@l
 ; CHECK-NEXT:    vperm 2, 3, 2, 4
diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
index 4435484ae0b9475..1a897f3498ab9f7 100644
--- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
+++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
@@ -317,24 +317,28 @@ define void @test16(ptr nocapture readonly %sums, i32 signext %delta, i32 signex
 ; P9BE-AIX32-LABEL: test16:
 ; P9BE-AIX32:       # %bb.0: # %entry
 ; P9BE-AIX32-NEXT:    slwi 4, 4, 1
-; P9BE-AIX32-NEXT:    li 6, 0
 ; P9BE-AIX32-NEXT:    lhzux 4, 3, 4
 ; P9BE-AIX32-NEXT:    lhz 3, 16(3)
-; P9BE-AIX32-NEXT:    sth 6, -64(1)
-; P9BE-AIX32-NEXT:    lxv 2, -64(1)
 ; P9BE-AIX32-NEXT:    sth 4, -48(1)
-; P9BE-AIX32-NEXT:    lxv 4, -48(1)
 ; P9BE-AIX32-NEXT:    sth 3, -32(1)
+; P9BE-AIX32-NEXT:    li 3, 0
+; P9BE-AIX32-NEXT:    sth 3, -64(1)
+; P9BE-AIX32-NEXT:    lwz 3, -32(1)
+; P9BE-AIX32-NEXT:    lxv 3, -64(1)
+; P9BE-AIX32-NEXT:    mtfprwz 0, 3
+; P9BE-AIX32-NEXT:    lwz 3, -48(1)
+; P9BE-AIX32-NEXT:    xxinsertw 2, 0, 0
+; P9BE-AIX32-NEXT:    mtfprwz 0, 3
 ; P9BE-AIX32-NEXT:    lwz 3, L..C3(2) # %const.0
-; P9BE-AIX32-NEXT:    lxv 3, -32(1)
-; P9BE-AIX32-NEXT:    vmrghh 4, 2, 4
+; P9BE-AIX32-NEXT:    vmrghh 2, 3, 2
+; P9BE-AIX32-NEXT:    xxinsertw 4, 0, 0
+; P9BE-AIX32-NEXT:    vmrghh 4, 3, 4
+; P9BE-AIX32-NEXT:    vsplth 3, 3, 0
 ; P9BE-AIX32-NEXT:    lxv 0, 0(3)
-; P9BE-AIX32-NEXT:    vmrghh 3, 2, 3
-; P9BE-AIX32-NEXT:    vsplth 2, 2, 0
-; P9BE-AIX32-NEXT:    xxmrghw 2, 2, 4
-; P9BE-AIX32-NEXT:    xxperm 3, 2, 0
-; P9BE-AIX32-NEXT:    xxspltw 2, 3, 1
-; P9BE-AIX32-NEXT:    vadduwm 2, 3, 2
+; P9BE-AIX32-NEXT:    xxmrghw 3, 3, 4
+; P9BE-AIX32-NEXT:    xxperm 2, 3, 0
+; P9BE-AIX32-NEXT:    xxspltw 3, 2, 1
+; P9BE-AIX32-NEXT:    vadduwm 2, 2, 3
 ; P9BE-AIX32-NEXT:    stxv 2, -16(1)
 ; P9BE-AIX32-NEXT:    lwz 3, -16(1)
 ; P9BE-AIX32-NEXT:    cmpw 3, 5
diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
index 3ab49cd39f8d804..cefe5ad7b9e7741 100644
--- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
@@ -251,9 +251,13 @@ define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_none_v16i8:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, 0(r4)
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -331,8 +335,12 @@ define <16 x i8> @test_v16i8_v8i16(i16 %arg, i8 %arg1) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r4, -32(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -411,8 +419,12 @@ define <16 x i8> @test_v8i16_v16i8(i16 %arg, i8 %arg1) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r4, -32(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v3, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -494,9 +506,13 @@ define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_none_v8i16:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, 0(r4)
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -640,9 +656,11 @@ define <16 x i8> @test_v16i8_v4i32(i8 %arg, i32 %arg1, <16 x i8> %a, <4 x i32> %
 ; CHECK-AIX-32-P9-LABEL: test_v16i8_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghb v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -721,9 +739,11 @@ define <16 x i8> @test_v4i32_v16i8(i32 %arg, i8 %arg1) {
 ; CHECK-AIX-32-P9-LABEL: test_v4i32_v16i8:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v3, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -984,9 +1004,11 @@ define <16 x i8> @test_v16i8_v2i64(i8 %arg, i64 %arg1, <16 x i8> %a, <2 x i64> %
 ; CHECK-AIX-32-P9-LABEL: test_v16i8_v2i64:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghb v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1059,9 +1081,11 @@ define <16 x i8> @test_v2i64_v16i8(i64 %arg, i8 %arg1) {
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r5, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v3, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1413,8 +1437,12 @@ define <16 x i8> @test_v8i16_v8i16rhs(i16 %arg, i16 %arg1) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r4, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v3, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1493,9 +1521,11 @@ define <16 x i8> @test_v8i16_v4i32(<8 x i16> %a, <4 x i32> %b, i16 %arg, i32 %ar
 ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghb v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1568,9 +1598,11 @@ define <16 x i8> @test_v8i16_v2i64(<8 x i16> %a, <2 x i64> %b, i16 %arg, i64 %ar
 ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghb v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1638,10 +1670,10 @@ define <16 x i8> @test_v4i32_v4i32(i32 %arg, i32 %arg1, <4 x i32> %a, <4 x i32>
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r4, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs0, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1719,10 +1751,12 @@ define <16 x i8> @test_v4i32_v8i16(i32 %arg, i16 %arg1) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1866,10 +1900,10 @@ define <16 x i8> @test_v2i64_v4i32(i64 %arg, i32 %arg1, <2 x i64> %a, <4 x i32>
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r5, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r5
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs0, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1941,10 +1975,12 @@ define <16 x i8> @test_v2i64_v8i16(i64 %arg, i16 %arg1) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r5, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
index fcfcda586694d53..e7596e8cb78884b 100644
--- a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
@@ -99,15 +99,14 @@ entry:
 define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_none_v16i8:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v16i8:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P9-NEXT:    blr
@@ -115,13 +114,13 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-BE-P8-LABEL: test_none_v16i8:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_none_v16i8:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P9-NEXT:    blr
@@ -129,13 +128,13 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-64-P8-LABEL: test_none_v16i8:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_none_v16i8:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -144,7 +143,8 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -153,7 +153,8 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -167,15 +168,14 @@ entry:
 define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_v16i8_none:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_v16i8_none:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P9-NEXT:    blr
@@ -183,13 +183,13 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-BE-P8-LABEL: test_v16i8_none:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_v16i8_none:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P9-NEXT:    blr
@@ -197,13 +197,13 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-64-P8-LABEL: test_v16i8_none:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_v16i8_none:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -212,7 +212,8 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -221,7 +222,8 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -377,15 +379,14 @@ entry:
 define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_v8i16_none:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_v8i16_none:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P9-NEXT:    blr
@@ -393,13 +394,13 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-BE-P8-LABEL: test_v8i16_none:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_v8i16_none:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P9-NEXT:    blr
@@ -407,13 +408,13 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-64-P8-LABEL: test_v8i16_none:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_v8i16_none:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -422,7 +423,8 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    sth r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -431,7 +433,8 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -445,15 +448,14 @@ entry:
 define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_none_v8i16:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v8i16:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P9-NEXT:    blr
@@ -461,13 +463,13 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-BE-P8-LABEL: test_none_v8i16:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_none_v8i16:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P9-NEXT:    blr
@@ -475,13 +477,13 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-64-P8-LABEL: test_none_v8i16:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_none_v8i16:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -490,7 +492,8 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    sth r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -499,7 +502,8 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -655,15 +659,14 @@ entry:
 define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_none_v4i32:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprwz f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v4i32:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P9-NEXT:    blr
@@ -671,13 +674,13 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-BE-P8-LABEL: test_none_v4i32:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_none_v4i32:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P9-NEXT:    blr
@@ -685,13 +688,13 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-64-P8-LABEL: test_none_v4i32:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_none_v4i32:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -700,7 +703,8 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -709,7 +713,8 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -723,15 +728,14 @@ entry:
 define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_v4i32_none:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprwz f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_v4i32_none:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P9-NEXT:    blr
@@ -739,13 +743,13 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-BE-P8-LABEL: test_v4i32_none:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_v4i32_none:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P9-NEXT:    blr
@@ -753,13 +757,13 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-64-P8-LABEL: test_v4i32_none:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_v4i32_none:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -768,7 +772,8 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -777,7 +782,8 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -934,12 +940,12 @@ define <2 x i64> @test_v2i64_v16i8(i8 %arg1, i64 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r5, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -48(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs2, -48(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r5
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f2, r4
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs2, vs2, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw vs1, vs2, vs1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
@@ -955,43 +961,48 @@ entry:
 define <2 x i64> @test_none_v2i64(ptr nocapture noundef readonly %b, i64 %arg) {
 ; CHECK-LE-P8-LABEL: test_none_v2i64:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r3
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
+; CHECK-LE-P8-NEXT:    lfd f0, 0(r3)
+; CHECK-LE-P8-NEXT:    xxspltd v2, vs0, 0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v2i64:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r3)
+; CHECK-LE-P9-NEXT:    lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT:    xxspltd v2, vs0, 0
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r4
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P9-NEXT:    blr
 ;
 ; CHECK-BE-P8-LABEL: test_none_v2i64:
 ; CHECK-BE-P8:       # %bb.0: # %entry
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r3
+; CHECK-BE-P8-NEXT:    lfd f0, 0(r3)
+; CHECK-BE-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-BE-P8-NEXT:    mtfprd f0, r4
 ; CHECK-BE-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_none_v2i64:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r3)
+; CHECK-BE-P9-NEXT:    lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-BE-P9-NEXT:    mtfprd f0, r4
 ; CHECK-BE-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-BE-P9-NEXT:    blr
 ;
 ; CHECK-AIX-64-P8-LABEL: test_none_v2i64:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT:    lfd f0, 0(r3)
+; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-64-P8-NEXT:    mtfprd f0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_none_v2i64:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT:    lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-64-P9-NEXT:    mtfprd f0, r4
 ; CHECK-AIX-64-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -1031,55 +1042,53 @@ entry:
 define <2 x i64> @test_v2i64_none(ptr nocapture noundef readonly %b, i64 %arg) {
 ; CHECK-LE-P8-LABEL: test_v2i64_none:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r3
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
+; CHECK-LE-P8-NEXT:    ld r3, 0(r3)
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r4
-; CHECK-LE-P8-NEXT:    xxpermdi v2, v2, vs0, 2
+; CHECK-LE-P8-NEXT:    mtfprd f1, r3
+; CHECK-LE-P8-NEXT:    xxmrghd v2, vs1, vs0
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_v2i64_none:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r3)
-; CHECK-LE-P9-NEXT:    mtfprd f0, r4
-; CHECK-LE-P9-NEXT:    xxpermdi v2, v2, vs0, 2
+; CHECK-LE-P9-NEXT:    ld r3, 0(r3)
+; CHECK-LE-P9-NEXT:    mtvsrdd v2, r3, r4
 ; CHECK-LE-P9-NEXT:    blr
 ;
 ; CHECK-BE-P8-LABEL: test_v2i64_none:
 ; CHECK-BE-P8:       # %bb.0: # %entry
+; CHECK-BE-P8-NEXT:    ld r3, 0(r3)
 ; CHECK-BE-P8-NEXT:    mtfprd f0, r4
-; CHECK-BE-P8-NEXT:    lxvd2x v3, 0, r3
-; CHECK-BE-P8-NEXT:    xxspltd v2, vs0, 0
-; CHECK-BE-P8-NEXT:    xxmrghd v2, v2, v3
+; CHECK-BE-P8-NEXT:    mtfprd f1, r3
+; CHECK-BE-P8-NEXT:    xxmrghd v2, vs0, vs1
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_v2i64_none:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r3)
-; CHECK-BE-P9-NEXT:    mtvsrdd v3, r4, r4
-; CHECK-BE-P9-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-P9-NEXT:    ld r3, 0(r3)
+; CHECK-BE-P9-NEXT:    mtvsrdd v2, r4, r3
 ; CHECK-BE-P9-NEXT:    blr
 ;
 ; CHECK-AIX-64-P8-LABEL: test_v2i64_none:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT:    ld r3, 0(r3)
 ; CHECK-AIX-64-P8-NEXT:    mtfprd f0, r4
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v3, 0, r3
-; CHECK-AIX-64-P8-NEXT:    xxmrghd v2, vs0, vs0
-; CHECK-AIX-64-P8-NEXT:    xxmrghd v2, v2, v3
+; CHECK-AIX-64-P8-NEXT:    mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT:    xxmrghd v2, vs0, vs1
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_v2i64_none:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r3)
-; CHECK-AIX-64-P9-NEXT:    mtvsrdd v3, r4, r4
-; CHECK-AIX-64-P9-NEXT:    xxmrghd v2, v3, v2
+; CHECK-AIX-64-P9-NEXT:    ld r3, 0(r3)
+; CHECK-AIX-64-P9-NEXT:    mtvsrdd v2, r4, r3
 ; CHECK-AIX-64-P9-NEXT:    blr
 ;
 ; CHECK-AIX-32-P8-LABEL: test_v2i64_none:
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r3)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
 ; CHECK-AIX-32-P8-NEXT:    stw r5, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    stw r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -32
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs1, 0, r3
@@ -1089,11 +1098,12 @@ define <2 x i64> @test_v2i64_none(ptr nocapture noundef readonly %b, i64 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_none:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r3)
-; CHECK-AIX-32-P9-NEXT:    stw r5, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lfd f0, 0(r3)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs0, vs0, 1
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r5
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs0, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw vs0, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
@@ -1536,13 +1546,13 @@ define <2 x i64> @test_v4i32_v2i64(i32 %arg1, i64 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -48(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r3
 ; CHECK-AIX-32-P9-NEXT:    lwz r3, L..C0(r2) # %const.0
-; CHECK-AIX-32-P9-NEXT:    lxv vs0, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -48(r1)
 ; CHECK-AIX-32-P9-NEXT:    stw r5, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs0, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw v3, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, 0(r3)
 ; CHECK-AIX-32-P9-NEXT:    xxperm v2, v3, vs0
@@ -1713,12 +1723,12 @@ define <2 x i64> @test_v2i64_v4i32(i64 %arg1, i32 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r4, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs2, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r4
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f2, r3
 ; CHECK-AIX-32-P9-NEXT:    stw r5, -48(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -48(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs2, vs2, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw vs1, vs2, vs1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
@@ -1793,12 +1803,12 @@ define <2 x i64> @test_v2i64_v8i16(i64 %arg1, i16 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r4, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs2, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r4
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f2, r3
 ; CHECK-AIX-32-P9-NEXT:    sth r5, -48(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -48(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs2, vs2, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw vs1, vs2, vs1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
index 47ffdb4625ed399..a4aa8eac2033dcf 100644
--- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
@@ -339,9 +339,11 @@ define void @test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0
 ; CHECK-AIX-32-P9-LABEL: test_none_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    lxsiwzx v2, 0, r3
-; CHECK-AIX-32-P9-NEXT:    lwz r3, L..C2(r2) # %const.0
 ; CHECK-AIX-32-P9-NEXT:    stb r5, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    lwz r3, L..C2(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    lxv v4, 0(r3)
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v3, v3, v3
 ; CHECK-AIX-32-P9-NEXT:    vperm v2, v2, v3, v4
diff --git a/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll b/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll
index 66c1b6f6d26daaa..cc2fe5604a3714b 100644
--- a/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll
@@ -45,18 +45,16 @@ define <2 x double> @test01(ptr %p1, ptr %p2) {
 define <2 x double> @test02(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test02:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxmrgld 34, 1, 0
+; CHECK-NEXT:    lfd 0, 0(3)
+; CHECK-NEXT:    lfd 1, 0(4)
+; CHECK-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test02:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxmrgld 34, 1, 0
+; CHECK-P9-NEXT:    lfd 0, 0(3)
+; CHECK-P9-NEXT:    lfd 1, 0(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -67,18 +65,16 @@ define <2 x double> @test02(ptr %p1, ptr %p2) {
 define <2 x double> @test03(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test03:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxpermdi 34, 1, 0, 1
+; CHECK-NEXT:    lfd 0, 0(3)
+; CHECK-NEXT:    lfd 1, 8(4)
+; CHECK-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test03:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxpermdi 34, 1, 0, 1
+; CHECK-P9-NEXT:    lfd 0, 0(3)
+; CHECK-P9-NEXT:    lfd 1, 8(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -123,18 +119,16 @@ define <2 x double> @test11(ptr %p1, ptr %p2) {
 define <2 x double> @test12(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxpermdi 34, 1, 0, 2
+; CHECK-NEXT:    lfd 0, 8(3)
+; CHECK-NEXT:    lfd 1, 0(4)
+; CHECK-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test12:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxpermdi 34, 1, 0, 2
+; CHECK-P9-NEXT:    lfd 0, 8(3)
+; CHECK-P9-NEXT:    lfd 1, 0(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -145,17 +139,15 @@ define <2 x double> @test12(ptr %p1, ptr %p2) {
 define <2 x double> @test13(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test13:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
+; CHECK-NEXT:    lfd 0, 8(3)
+; CHECK-NEXT:    lfd 1, 8(4)
 ; CHECK-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test13:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
+; CHECK-P9-NEXT:    lfd 0, 8(3)
+; CHECK-P9-NEXT:    lfd 1, 8(4)
 ; CHECK-P9-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
@@ -167,18 +159,16 @@ define <2 x double> @test13(ptr %p1, ptr %p2) {
 define <2 x double> @test20(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test20:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxmrgld 34, 0, 1
+; CHECK-NEXT:    lfd 0, 0(3)
+; CHECK-NEXT:    lfd 1, 0(4)
+; CHECK-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test20:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxmrgld 34, 0, 1
+; CHECK-P9-NEXT:    lfd 0, 0(3)
+; CHECK-P9-NEXT:    lfd 1, 0(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -189,18 +179,16 @@ define <2 x double> @test20(ptr %p1, ptr %p2) {
 define <2 x double> @test21(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test21:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxpermdi 34, 0, 1, 1
+; CHECK-NEXT:    lfd 0, 8(3)
+; CHECK-NEXT:    lfd 1, 0(4)
+; CHECK-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test21:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxpermdi 34, 0, 1, 1
+; CHECK-P9-NEXT:    lfd 0, 8(3)
+; CHECK-P9-NEXT:    lfd 1, 0(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -244,18 +232,16 @@ define <2 x double> @test23(ptr %p1, ptr %p2) {
 define <2 x double> @test30(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test30:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxpermdi 34, 0, 1, 2
+; CHECK-NEXT:    lfd 0, 0(3)
+; CHECK-NEXT:    lfd 1, 8(4)
+; CHECK-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test30:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxpermdi 34, 0, 1, 2
+; CHECK-P9-NEXT:    lfd 0, 0(3)
+; CHECK-P9-NEXT:    lfd 1, 8(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -266,17 +252,15 @@ define <2 x double> @test30(ptr %p1, ptr %p2) {
 define <2 x double> @test31(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test31:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
+; CHECK-NEXT:    lfd 0, 8(3)
+; CHECK-NEXT:    lfd 1, 8(4)
 ; CHECK-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test31:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
+; CHECK-P9-NEXT:    lfd 0, 8(3)
+; CHECK-P9-NEXT:    lfd 1, 8(4)
 ; CHECK-P9-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 41d8abb9b73ebca..b1735ec832d9bdf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
 
 define <4 x bfloat> @shuffle_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; CHECK-LABEL: shuffle_v4bf16:
@@ -385,12 +385,19 @@ define <4 x half> @vrgather_shuffle_vv_v4f16(<4 x half> %x, <4 x half> %y) {
 }
 
 define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) {
-; CHECK-LABEL: vrgather_shuffle_vx_v4f16_load:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lh a0, 2(a0)
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
+; ZVFH-LABEL: vrgather_shuffle_vx_v4f16_load:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    flh fa5, 2(a0)
+; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT:    vfmv.v.f v8, fa5
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vrgather_shuffle_vx_v4f16_load:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    lh a0, 2(a0)
+; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    ret
   %v = load <4 x half>, ptr %p
   %s = shufflevector <4 x half> %v, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x half> %s
diff --git a/llvm/test/CodeGen/Thumb2/mve-extractstore.ll b/llvm/test/CodeGen/Thumb2/mve-extractstore.ll
index 941ae78cc9a79ae..18ff9034a453031 100644
--- a/llvm/test/CodeGen/Thumb2/mve-extractstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-extractstore.ll
@@ -5,10 +5,9 @@ define half @extret1_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
 ; CHECK-LABEL: extret1_f16_sf:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d0, r0, r1
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vadd.f16 q0, q0, r0
 ; CHECK-NEXT:    ldr r0, [sp, #16]
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    vmov r0, s0
@@ -22,11 +21,10 @@ define half @extret1_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
 define half @extret4_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
 ; CHECK-LABEL: extret4_f16_sf:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    ldrh.w r0, [sp, #8]
 ; CHECK-NEXT:    vmov d1, r2, r3
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vadd.f16 q0, q0, r0
 ; CHECK-NEXT:    ldr r0, [sp, #16]
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstr.16 s2, [r0]
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    bx lr
@@ -98,11 +96,10 @@ define arm_aapcs_vfpcc <8 x half> @extret4_v8f16_hf(<8 x half> %a, <8 x half> %b
 define float @extret1_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
 ; CHECK-LABEL: extret1_f32_sf:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov d0, r0, r1
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldr s1, [sp, #4]
+; CHECK-NEXT:    vmov d2, r0, r1
 ; CHECK-NEXT:    ldr r1, [sp, #16]
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vadd.f32 q0, q1, q0
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vstr s1, [r1]
 ; CHECK-NEXT:    bx lr
@@ -115,11 +112,10 @@ define float @extret1_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
 define float @extret2_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
 ; CHECK-LABEL: extret2_f32_sf:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vmov d1, r2, r3
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldr s2, [sp, #8]
+; CHECK-NEXT:    vmov d3, r2, r3
 ; CHECK-NEXT:    ldr r1, [sp, #16]
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vadd.f32 q0, q1, q0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vstr s2, [r1]
 ; CHECK-NEXT:    bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
index 5f56a82f3c51109..78c754f712bfabc 100644
--- a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
@@ -128,9 +128,11 @@ define <8 x i16> @inserti8_last_zext(ptr %p) {
 define <8 x i32> @inserti32_first(ptr %p) {
 ; CHECKLE-LABEL: inserti32_first:
 ; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    ldr r1, [r0, #16]
 ; CHECKLE-NEXT:    vldrw.u32 q2, [r0, #20]
-; CHECKLE-NEXT:    vldr s4, [r0, #16]
 ; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKLE-NEXT:    vmov.32 q1[3], r1
+; CHECKLE-NEXT:    vmov.f32 s4, s7
 ; CHECKLE-NEXT:    vmov.f32 s5, s8
 ; CHECKLE-NEXT:    vmov.f32 s6, s9
 ; CHECKLE-NEXT:    vmov.f32 s7, s10
@@ -138,14 +140,16 @@ define <8 x i32> @inserti32_first(ptr %p) {
 ;
 ; CHECKBE-LABEL: inserti32_first:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrw.u32 q3, [r0, #20]
 ; CHECKBE-NEXT:    vldrb.u8 q1, [r0]
-; CHECKBE-NEXT:    vldr s8, [r0, #16]
-; CHECKBE-NEXT:    vmov.f32 s9, s12
+; CHECKBE-NEXT:    ldr r1, [r0, #16]
+; CHECKBE-NEXT:    vldrw.u32 q2, [r0, #20]
 ; CHECKBE-NEXT:    vrev64.8 q0, q1
-; CHECKBE-NEXT:    vmov.f32 s10, s13
-; CHECKBE-NEXT:    vmov.f32 s11, s14
-; CHECKBE-NEXT:    vrev64.32 q1, q2
+; CHECKBE-NEXT:    vmov.32 q1[3], r1
+; CHECKBE-NEXT:    vmov.f32 s12, s7
+; CHECKBE-NEXT:    vmov.f32 s13, s8
+; CHECKBE-NEXT:    vmov.f32 s14, s9
+; CHECKBE-NEXT:    vmov.f32 s15, s10
+; CHECKBE-NEXT:    vrev64.32 q1, q3
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 4
   %l1 = load <8 x i32>, ptr %q
@@ -158,24 +162,28 @@ define <8 x i32> @inserti32_first(ptr %p) {
 define <8 x i32> @inserti32_last(ptr %p) {
 ; CHECKLE-LABEL: inserti32_last:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrw.u32 q2, [r0]
-; CHECKLE-NEXT:    vldr s3, [r0, #16]
+; CHECKLE-NEXT:    ldr r1, [r0, #16]
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
 ; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #20]
-; CHECKLE-NEXT:    vmov.f32 s0, s9
-; CHECKLE-NEXT:    vmov.f32 s1, s10
-; CHECKLE-NEXT:    vmov.f32 s2, s11
+; CHECKLE-NEXT:    vmov.32 q2[0], r1
+; CHECKLE-NEXT:    vmov.f32 s0, s1
+; CHECKLE-NEXT:    vmov.f32 s1, s2
+; CHECKLE-NEXT:    vmov.f32 s2, s3
+; CHECKLE-NEXT:    vmov.f32 s3, s8
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti32_last:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrw.u32 q3, [r0]
-; CHECKBE-NEXT:    vldrb.u8 q0, [r0, #20]
-; CHECKBE-NEXT:    vldr s11, [r0, #16]
-; CHECKBE-NEXT:    vmov.f32 s8, s13
-; CHECKBE-NEXT:    vrev64.8 q1, q0
-; CHECKBE-NEXT:    vmov.f32 s9, s14
-; CHECKBE-NEXT:    vmov.f32 s10, s15
-; CHECKBE-NEXT:    vrev64.32 q0, q2
+; CHECKBE-NEXT:    ldr r1, [r0, #16]
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKBE-NEXT:    vldrb.u8 q2, [r0, #20]
+; CHECKBE-NEXT:    vmov.32 q1[0], r1
+; CHECKBE-NEXT:    vmov.f32 s12, s1
+; CHECKBE-NEXT:    vmov.f32 s15, s4
+; CHECKBE-NEXT:    vrev64.8 q1, q2
+; CHECKBE-NEXT:    vmov.f32 s13, s2
+; CHECKBE-NEXT:    vmov.f32 s14, s3
+; CHECKBE-NEXT:    vrev64.32 q0, q3
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 32
   %l1 = load <8 x i32>, ptr %p
diff --git a/llvm/test/CodeGen/X86/SwizzleShuff.ll b/llvm/test/CodeGen/X86/SwizzleShuff.ll
index 0cfafdd86863e53..9f3dffd75cfa873 100644
--- a/llvm/test/CodeGen/X86/SwizzleShuff.ll
+++ b/llvm/test/CodeGen/X86/SwizzleShuff.ll
@@ -19,7 +19,7 @@ define void @pull_bitcast(ptr %pA, ptr %pB) {
 define <4 x i32> @multi_use_swizzle(ptr %pA, ptr %pB) {
 ; CHECK-LABEL: multi_use_swizzle:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %xmm0
+; CHECK-NEXT:    vbroadcastss 4(%rdi), %xmm0
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,2]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[1,3,2,2]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,0,2]
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
index 0bfd8921e8b42a8..e5eb366fe22e303 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -370,12 +370,12 @@ define <4 x i32> @load_splat_4i32_4i32_1111(ptr %ptr) nounwind uwtable readnone
 ; X86-LABEL: load_splat_4i32_4i32_1111:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; X86-NEXT:    vbroadcastss 4(%eax), %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: load_splat_4i32_4i32_1111:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; X64-NEXT:    vbroadcastss 4(%rdi), %xmm0
 ; X64-NEXT:    retq
 entry:
   %ld = load <4 x i32>, ptr %ptr
@@ -477,7 +477,7 @@ define <2 x i64> @load_splat_2i64_2i64_1111(ptr %ptr) nounwind uwtable readnone
 ;
 ; X64-LABEL: load_splat_2i64_2i64_1111:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
+; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X64-NEXT:    retq
 entry:
   %ld = load <2 x i64>, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index 4ce092c099b0887..3b52773db867395 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -81,13 +81,15 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
+; X86-NEXT:    vmovss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    vinsertps $0, 12(%rdi,%rsi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
+; X64-NEXT:    vmovss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index
   %2 = load <4 x float>, ptr %1, align 16
diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll
index e971d1e471bf7a1..49cc58c8e73a8a9 100644
--- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll
+++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -9,14 +9,16 @@ define void @test1(ptr %A, ptr %C) #0 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X86-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test1:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
 ; X64-NEXT:    retq
   %tmp2 = load <8 x float>, ptr %A, align 32
@@ -34,15 +36,15 @@ define void @test2(ptr %A, ptr %C) #0 {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %xmm0
-; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = [2147483647,0,0,0]
+; X86-NEXT:    vorps (%ecx), %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test2:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %xmm0
-; X64-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = [2147483647,0,0,0]
+; X64-NEXT:    vorps (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
 ; X64-NEXT:    retq
   %tmp2 = load <8 x float>, ptr %A, align 32
@@ -60,15 +62,15 @@ define void @test3(ptr %A, ptr %C) #0 {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %xmm0
-; X86-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = [2147483647,0,0,0]
+; X86-NEXT:    vxorps (%ecx), %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %xmm0
-; X64-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = [2147483647,0,0,0]
+; X64-NEXT:    vxorps (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
 ; X64-NEXT:    retq
   %tmp2 = load <8 x float>, ptr %A, align 32
@@ -86,14 +88,16 @@ define void @test4(ptr %A, ptr %C) #0 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovaps (%ecx), %xmm0
-; X86-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = [2147483647,0,0,0]
+; X86-NEXT:    vandnps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test4:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    vmovaps (%rdi), %xmm0
-; X64-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovss {{.*#+}} xmm1 = [2147483647,0,0,0]
+; X64-NEXT:    vandnps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
 ; X64-NEXT:    retq
   %tmp2 = load <8 x float>, ptr %A, align 32
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index 0fae921b1ca83db..20550fc4eb9fa61 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -1158,7 +1158,7 @@ define <16 x i32> @masked_inc_test(<16 x i32> %i, <16 x i32> %mask1) nounwind re
 ; CHECK-LABEL: masked_inc_test:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
 ; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -1171,7 +1171,7 @@ define <16 x i32> @masked_dec_test(<16 x i32> %i, <16 x i32> %mask1) nounwind re
 ; CHECK-LABEL: masked_dec_test:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
index 7e48b3719cf0ffa..5d7c337c94dee26 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
@@ -18,18 +18,18 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
 ; AVX512F-NEXT:    kmovw %ecx, %k2
 ; AVX512F-NEXT:    kmovw %eax, %k3
 ; AVX512F-NEXT:    kmovw %edi, %k4
-; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
 ; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
 ; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vpternlogq $216, %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
@@ -37,7 +37,7 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogq $226, %zmm4, %zmm2, %zmm0
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm2 & (zmm0 ^ zmm4))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: add_v64i8_broadcasts:
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index c27cced9d5ffa72..23b46ee59154fba 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -166,7 +166,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
 ; KNL-NEXT:    pushq %rax
 ; KNL-NEXT:    .cfi_def_cfa_offset 16
 ; KNL-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    callq _func16xi1
 ; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -194,7 +194,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
 ; KNL_X32-NEXT:    subl $12, %esp
 ; KNL_X32-NEXT:    .cfi_def_cfa_offset 16
 ; KNL_X32-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
-; KNL_X32-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL_X32-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL_X32-NEXT:    calll _func16xi1
 ; KNL_X32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll
index e46378804be5e5e..56d6d136fdb6143 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp.ll
@@ -190,7 +190,7 @@ define <8 x i32> @legalize_loop(<8 x double> %arg) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; KNL-NEXT:    vcmpnltpd %zmm0, %zmm1, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[3,2,1,0,7,6,5,4]
 ; KNL-NEXT:    vpsrld $31, %ymm1, %ymm1
 ; KNL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 79e59fdcf4a1255..d19eaf4459fee5d 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -1603,7 +1603,7 @@ define   <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
 ; KNL-LABEL: zext_16i1_to_16xi32:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vpsrld $31, %zmm0, %zmm0
 ; KNL-NEXT:    retq
 ;
@@ -1629,7 +1629,7 @@ define   <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
 ; KNL-LABEL: zext_8i1_to_8xi64:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vpsrlq $63, %zmm0, %zmm0
 ; KNL-NEXT:    retq
 ;
@@ -1747,14 +1747,14 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; KNL-LABEL: sext_8i1_8i32:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; KNL-NEXT:    vpternlogq {{.*#+}} zmm0 = ~zmm0
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; KNL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sext_8i1_8i32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vpternlogq $15, %ymm0, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ~ymm0
 ; AVX512DQ-NEXT:    retq
   %x = icmp slt <8 x i32> %a1, %a2
   %x1 = xor <8 x i1>%x, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
@@ -1840,7 +1840,7 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
 ; KNL-LABEL: sext_16i1_16i32:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sext_16i1_16i32:
@@ -2313,12 +2313,12 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    korw %k2, %k0, %k2
-; KNL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; KNL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1
 ; KNL-NEXT:    vpmovdw %zmm2, %ymm2
 ; KNL-NEXT:    vpmovdw %zmm3, %ymm3
 ; KNL-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index 6c661eb771d1bdd..5115c3cdc259a70 100644
--- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -485,7 +485,7 @@ define void @load_v64i1_broadcast_32_v16i1(ptr %a0,<16 x float> %a1,<16 x float>
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
-; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
@@ -642,7 +642,7 @@ define void @load_v64i1_broadcast_63_v16i1(ptr %a0,<16 x float> %a1,<16 x float>
 ; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
 ; AVX512NOTDQ-FAST:       # %bb.0:
 ; AVX512NOTDQ-FAST-NEXT:    kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-FAST-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-FAST-NEXT:    vpermd %zmm2, %zmm3, %zmm2
 ; AVX512NOTDQ-FAST-NEXT:    vptestmd %zmm2, %zmm2, %k1
@@ -654,7 +654,7 @@ define void @load_v64i1_broadcast_63_v16i1(ptr %a0,<16 x float> %a1,<16 x float>
 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
 ; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %zmm2, %zmm2, %k1
@@ -1426,7 +1426,7 @@ define void @load_v64i1_broadcast_32_v16i1_store(ptr %a0,ptr %a1) {
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
-; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; AVX512NOTDQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512NOTDQ-NEXT:    kmovw %k0, (%rsi)
@@ -1596,7 +1596,7 @@ define void @load_v64i1_broadcast_63_v16i1_store(ptr %a0,ptr %a1) {
 ; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
 ; AVX512NOTDQ-FAST:       # %bb.0:
 ; AVX512NOTDQ-FAST-NEXT:    kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-FAST-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
 ; AVX512NOTDQ-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1607,7 +1607,7 @@ define void @load_v64i1_broadcast_63_v16i1_store(ptr %a0,ptr %a1) {
 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
 ; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %zmm0, %zmm0, %k0
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index a8574c0b7516c14..80b4ae494209733 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -5824,8 +5824,10 @@ define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %
 ;
 ; X64-LABEL: test_mm_mask_fmsub_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm2, %xmm2
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %__W, i64 0
@@ -5876,8 +5878,10 @@ define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double>
 ;
 ; X64-LABEL: test_mm_maskz_fmsub_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm2, %xmm2
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %__A, i64 0
@@ -5932,8 +5936,11 @@ define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double
 ;
 ; X64-LABEL: test_mm_mask3_fmsub_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm2, %xmm3
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm3, %xmm0, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; X64-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
 ; X64-NEXT:    vmovapd %xmm2, %xmm0
 ; X64-NEXT:    retq
 entry:
@@ -5986,8 +5993,10 @@ define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext
 ;
 ; X64-LABEL: test_mm_mask_fnmadd_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %__W, i64 0
@@ -6038,8 +6047,10 @@ define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double>
 ;
 ; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %__A, i64 0
@@ -6094,8 +6105,10 @@ define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x doubl
 ;
 ; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; X64-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; X64-NEXT:    vmovapd %xmm2, %xmm0
 ; X64-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/avx512-load-store.ll b/llvm/test/CodeGen/X86/avx512-load-store.ll
index c32c3d9b8550392..ce6bfa90d88a79b 100644
--- a/llvm/test/CodeGen/X86/avx512-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-load-store.ll
@@ -143,7 +143,7 @@ define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, ptr
 ; CHECK64-LABEL: test_mm_mask_load_ss:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_mask_load_ss:
@@ -151,7 +151,7 @@ define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, ptr
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
   %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
@@ -168,7 +168,7 @@ define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, pt
 ; CHECK64-LABEL: test_mm_mask_load_sd:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_mask_load_sd:
@@ -176,7 +176,7 @@ define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, pt
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
   %shuffle5.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
@@ -192,7 +192,7 @@ define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, ptr %__W) local_unnam
 ; CHECK64-LABEL: test_mm_maskz_load_ss:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_ss:
@@ -200,7 +200,7 @@ define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, ptr %__W) local_unnam
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
@@ -215,7 +215,7 @@ define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, ptr %__W) local_unna
 ; CHECK64-LABEL: test_mm_maskz_load_sd:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_sd:
@@ -223,7 +223,7 @@ define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, ptr %__W) local_unna
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
@@ -283,7 +283,7 @@ define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, pt
 ; CHECK64-LABEL: test_mm_mask_load_ss_2:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_mask_load_ss_2:
@@ -291,7 +291,7 @@ define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, pt
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
   %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
@@ -306,7 +306,7 @@ define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, ptr readonly %__W)
 ; CHECK64-LABEL: test_mm_maskz_load_ss_2:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_ss_2:
@@ -314,7 +314,7 @@ define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, ptr readonly %__W)
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
@@ -328,7 +328,7 @@ define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U,
 ; CHECK64-LABEL: test_mm_mask_load_sd_2:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_mask_load_sd_2:
@@ -336,7 +336,7 @@ define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U,
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
   %shuffle3.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
@@ -351,7 +351,7 @@ define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, ptr readonly %__W)
 ; CHECK64-LABEL: test_mm_maskz_load_sd_2:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_sd_2:
@@ -359,7 +359,7 @@ define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, ptr readonly %__W)
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index e53e194ba05c2ac..23f4fcb1c77c64f 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -856,7 +856,7 @@ entry:
 define <16 x i32> @ternlog_and_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
 ; ALL-LABEL: ternlog_and_andn:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd $8, %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm2 & zmm1 & ~zmm0
 ; ALL-NEXT:    retq
   %a = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   %b = and <16 x i32> %y, %a
@@ -867,7 +867,7 @@ define <16 x i32> @ternlog_and_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 define <16 x i32> @ternlog_or_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
 ; ALL-LABEL: ternlog_or_andn:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd $206, %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = (zmm1 & ~zmm0) | zmm2
 ; ALL-NEXT:    retq
   %a = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   %b = and <16 x i32> %y, %a
@@ -878,7 +878,7 @@ define <16 x i32> @ternlog_or_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
 ; ALL-LABEL: ternlog_xor_andn:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd $198, %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm2 ^ (zmm1 & ~zmm0)
 ; ALL-NEXT:    retq
   %a = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   %b = and <16 x i32> %y, %a
@@ -889,7 +889,7 @@ define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
 ; ALL-LABEL: ternlog_or_and_mask:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = (zmm0 & mem) | zmm1
 ; ALL-NEXT:    retq
   %a = and <16 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %b = or <16 x i32> %a, %y
@@ -899,7 +899,7 @@ define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
 define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) {
 ; ALL-LABEL: ternlog_xor_and_mask:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; ALL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
 ; ALL-NEXT:    retq
   %a = and <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   %b = xor <8 x i64> %a, %y
@@ -911,7 +911,7 @@ define <16 x i32> @ternlog_maskz_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x
 ; ALL:       ## %bb.0:
 ; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
 ; ALL-NEXT:    vpsrad $31, %zmm2, %zmm0
-; ALL-NEXT:    vpternlogd $224, %zmm1, %zmm3, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 & (zmm3 | zmm1)
 ; ALL-NEXT:    retq
   %m = icmp slt <16 x i32> %mask, zeroinitializer
   %a = and <16 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -925,7 +925,7 @@ define <8 x i64> @ternlog_maskz_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i6
 ; ALL:       ## %bb.0:
 ; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm3
 ; ALL-NEXT:    vpsraq $63, %zmm2, %zmm0
-; ALL-NEXT:    vpternlogq $96, %zmm1, %zmm3, %zmm0
+; ALL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 & (zmm3 ^ zmm1)
 ; ALL-NEXT:    retq
   %m = icmp slt <8 x i64> %mask, zeroinitializer
   %a = and <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index 536c667c7ec9023..721ffbe1ceb7973 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -502,7 +502,7 @@ define <16 x i64> @narrowExtractedVectorSelect_crash(<16 x i64> %arg, <16 x i16>
 ; X86-AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; X86-AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; X86-AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
-; X86-AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; X86-AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; X86-AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
 ; X86-AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
@@ -515,7 +515,7 @@ define <16 x i64> @narrowExtractedVectorSelect_crash(<16 x i64> %arg, <16 x i16>
 ; X64-AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; X64-AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; X64-AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
-; X64-AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; X64-AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; X64-AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
 ; X64-AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index aac5847061cbec8..c2afa2b971d7525 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -227,9 +227,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
-; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vpbroadcastw 2(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,8,11,8,13,8,15,9]
+; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
@@ -243,10 +243,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
+; CHECK-NEXT:    vpbroadcastw 2(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
@@ -259,10 +259,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vpbroadcastw 18(%rdi), %xmm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [1,15,1,14,1,12,11,10]
+; CHECK-NEXT:    vpermi2w (%rdi), %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -271,12 +270,11 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2]
+; CHECK-NEXT:    vpbroadcastw 18(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,15,1,14,1,12,11,10]
+; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -288,11 +286,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
+; CHECK-NEXT:    vpbroadcastw 18(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [1,15,1,14,1,12,11,10]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -778,9 +776,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm3
-; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17]
+; CHECK-NEXT:    vpbroadcastw 22(%rdi), %ymm3
+; CHECK-NEXT:    vpermt2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
@@ -795,10 +793,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17]
+; CHECK-NEXT:    vpbroadcastw 22(%rdi), %ymm1
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1041,8 +1039,9 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i
 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm0
-; CHECK-NEXT:    vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0]
+; CHECK-NEXT:    vmovss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,1],xmm0[0,0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
@@ -1051,8 +1050,9 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0]
+; CHECK-NEXT:    vmovss (%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm3
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
@@ -1066,8 +1066,9 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT:    vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0]
+; CHECK-NEXT:    vmovss (%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
+; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
@@ -1081,12 +1082,11 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,0,0,3]
+; CHECK-NEXT:    vpbroadcastd 20(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,4,4,7]
+; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1098,11 +1098,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3]
+; CHECK-NEXT:    vpbroadcastd 20(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,4,4,7]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1114,9 +1114,9 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT:    vmovd 16(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0]
-; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vpermi2d 12(%rdi){1to4}, %xmm2, %xmm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
@@ -1130,10 +1130,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT:    vmovd 16(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2d 12(%rdi){1to4}, %xmm2, %xmm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
@@ -1609,9 +1609,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32
 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [13,0,0,6]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [5,8,8,14]
+; CHECK-NEXT:    vpbroadcastd 52(%rdi), %ymm0
+; CHECK-NEXT:    vpermt2d (%rdi), %ymm1, %ymm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
@@ -1621,11 +1622,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,8,8,14]
+; CHECK-NEXT:    vpbroadcastd 52(%rdi), %ymm3
+; CHECK-NEXT:    vpermt2d (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
@@ -1638,10 +1639,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,8,8,14]
+; CHECK-NEXT:    vpbroadcastd 52(%rdi), %ymm1
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vpermt2d (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
@@ -1654,9 +1656,9 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0]
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    vpbroadcastd 28(%rdi), %ymm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [7,13,11,10,0,0,0,0]
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
@@ -1671,10 +1673,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0]
+; CHECK-NEXT:    vpbroadcastd 28(%rdi), %ymm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,13,11,10,0,0,0,0]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1853,8 +1855,9 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i
 define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %xmm0
-; CHECK-NEXT:    vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT:    vmovsd 24(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -1863,9 +1866,10 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
+; CHECK-NEXT:    vmovq 24(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovq 8(%rdi), %xmm3 # xmm3 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1]
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -1877,9 +1881,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
+; CHECK-NEXT:    vmovq 24(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovq 8(%rdi), %xmm2 # xmm2 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1]
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -1891,10 +1896,10 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-NEXT:    vmovq 8(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovq 16(%rdi), %xmm3 # xmm3 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
@@ -1906,10 +1911,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
-; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-NEXT:    vmovq 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovq 16(%rdi), %xmm2 # xmm2 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
@@ -2374,17 +2379,17 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4]
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vmovq (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,7,6,0]
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $15, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vmovq (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $240, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2398,17 +2403,17 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4]
+; CHECK-FAST-NEXT:    vmovq (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,6,0]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $15, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vmovq (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $240, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2422,17 +2427,18 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1]
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,1,1,5]
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2446,17 +2452,18 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1]
+; CHECK-FAST-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,1,1,5]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2470,16 +2477,16 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2]
-; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm1
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [2,4,4,6]
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-FAST-PERLANE-NEXT:    vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3]
+; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd 56(%rdi), %ymm0
+; CHECK-FAST-PERLANE-NEXT:    vunpcklpd (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,1,3]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
@@ -2488,17 +2495,17 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,0,0,2]
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,4,4,6]
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2512,16 +2519,17 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2]
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [2,4,4,6]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 56(%rdi), %ymm1
+; CHECK-FAST-PERLANE-NEXT:    vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2567,17 +2575,17 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,2,7,1]
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,6,3,5]
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $63, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2591,16 +2599,17 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1]
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,6,3,5]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 56(%rdi), %ymm1
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $63, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2614,9 +2623,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    vpbroadcastq 56(%rdi), %xmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [1,6,7,6]
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2625,11 +2634,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,2,3,2]
+; CHECK-NEXT:    vpbroadcastq 56(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,6,7,6]
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2641,10 +2650,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2]
+; CHECK-NEXT:    vpbroadcastq 56(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [1,6,7,6]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2656,17 +2666,18 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5]
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,7,5,1]
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2680,17 +2691,18 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5]
+; CHECK-FAST-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,7,5,1]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2712,8 +2724,9 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovaps 32(%rdi), %xmm0
-; CHECK-FAST-PERLANE-NEXT:    vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 32(%rdi), %xmm0
+; CHECK-FAST-PERLANE-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2732,10 +2745,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 32(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm3
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2756,10 +2769,10 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm1
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 32(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm2
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2771,9 +2784,10 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 48(%rdi), %xmm2
+; CHECK-NEXT:    vmovq 16(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovq 48(%rdi), %xmm3 # xmm3 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2785,9 +2799,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 48(%rdi), %xmm1
+; CHECK-NEXT:    vmovq 16(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovq 48(%rdi), %xmm2 # xmm2 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2946,9 +2961,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1]
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    vbroadcastss 8(%rdi), %xmm1
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [6,2,4,5]
+; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2957,9 +2972,9 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1]
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vbroadcastss 8(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [6,2,4,5]
+; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm3
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
@@ -2974,11 +2989,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1]
+; CHECK-NEXT:    vbroadcastss 8(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,2,4,5]
 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
@@ -2991,12 +3006,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,7,7,2]
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm2
+; CHECK-NEXT:    vbroadcastss 24(%rdi), %xmm3
+; CHECK-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -3008,12 +3023,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,7,7,2]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm1
+; CHECK-NEXT:    vbroadcastss 24(%rdi), %xmm2
+; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -3025,13 +3040,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,1,3,7]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 28(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [7,5,7,3]
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -3043,12 +3057,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7]
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 28(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [7,5,7,3]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -3060,10 +3074,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,3,5,3]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 20(%rdi), %xmm1
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [5,7,1,7]
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3072,13 +3085,12 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 20(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [5,7,1,7]
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3090,12 +3102,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3]
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 20(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [5,7,1,7]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3680,14 +3692,10 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT:    # xmm2 = mem[0,0]
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm3
-; CHECK-NEXT:    vpermt2ps (%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 48(%rdi), %xmm2
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpcklps 24(%rdi){1to4}, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
@@ -3699,14 +3707,10 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT:    # xmm2 = mem[0,0]
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 48(%rdi), %xmm1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vunpcklps 24(%rdi){1to4}, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
@@ -3718,9 +3722,10 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [3,3,15,9]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm0
+; CHECK-NEXT:    vpermt2ps 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3730,12 +3735,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm3
+; CHECK-NEXT:    vpermt2ps 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3748,11 +3753,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3835,8 +3841,8 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double>
 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %xmm0
-; CHECK-NEXT:    vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovhps 8(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
@@ -3845,11 +3851,11 @@ define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm2
-; CHECK-NEXT:    vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vmovapd %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm3 # xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
@@ -3861,11 +3867,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm1
-; CHECK-NEXT:    vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1]
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vmovapd %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
@@ -3877,10 +3883,11 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp,
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 16(%rdi), %xmm2
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT:    vmovsd (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm3 # xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -3892,10 +3899,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 16(%rdi), %xmm1
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT:    vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -4382,8 +4390,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vbroadcastsd 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,0,6,2]
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,0,6,0]
 ; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
@@ -4392,10 +4400,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vblendpd $12, (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1],mem[2,3]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm2[3,0,2,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
@@ -4407,8 +4416,8 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vbroadcastsd 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,0,6,2]
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,0,6,0]
 ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -4417,10 +4426,11 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp,
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vpermpd $236, (%rdi), %ymm1 # ymm1 = mem[0,3,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vblendpd $12, (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1],mem[2,3]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,2,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
@@ -4432,21 +4442,22 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [1,2,3,4]
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [5,6,7,0]
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1]
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[0,1]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
+; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm3[1],ymm2[0],ymm3[3],ymm2[2]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -4458,20 +4469,22 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4]
-; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [5,6,7,0]
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1]
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
+; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm1[0],ymm2[3],ymm1[2]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -4481,26 +4494,43 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp,
 }
 
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
-; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,6,5,4]
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $24, (%rdi), %ymm1 # ymm1 = mem[0,2,1,0]
+; CHECK-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [4,2,1,0]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,6,5,4]
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $24, (%rdi), %ymm3 # ymm3 = mem[0,2,1,0]
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4509,14 +4539,25 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4
 }
 
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0]
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,6,5,4]
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $24, (%rdi), %ymm2 # ymm2 = mem[0,2,1,0]
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4559,14 +4600,25 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp,
 }
 
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovddup 40(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,1,1,1]
+; CHECK-FAST-NEXT:    vpermi2pd 16(%rdi){1to4}, %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovddup 40(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd 16(%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,1]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4575,14 +4627,25 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4
 }
 
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1]
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovddup 40(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [6,1,1,1]
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT:    vpermi2pd 16(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovddup 40(%rdi), %xmm1 # xmm1 = mem[0,0]
+; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd 16(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,1]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4591,26 +4654,47 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp,
 }
 
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
-; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm1
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,2,4,1]
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [4,6,2,5]
+; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $104, 32(%rdi), %ymm1 # ymm1 = mem[0,2,2,1]
+; CHECK-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,2,4,1]
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,6,2,5]
+; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $104, 32(%rdi), %ymm3 # ymm3 = mem[0,2,2,1]
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4619,15 +4703,27 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4
 }
 
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,1]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,6,2,5]
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $104, 32(%rdi), %ymm2 # ymm2 = mem[0,2,2,1]
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4668,8 +4764,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp,
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm0
-; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovhps 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4678,10 +4774,11 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm2
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovsd 48(%rdi), %xmm3 # xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],xmm3[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4693,10 +4790,11 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm1
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovsd 48(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4711,7 +4809,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
 ; CHECK-NEXT:    vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT:    vunpcklpd 32(%rdi){1to2}, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
@@ -4726,7 +4824,7 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp,
 ; CHECK-NEXT:    vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT:    vunpcklpd 32(%rdi){1to2}, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
index b7b1212e767222b..0df466cdab5bbcf 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
@@ -826,7 +826,8 @@ define <2 x double> @test_2xdouble_zero_masked_shuff_mask1(<2 x double> %vec1, <
 define <2 x double> @test_2xdouble_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2p) {
 ; CHECK-LABEL: test_2xdouble_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
@@ -835,9 +836,10 @@ define <2 x double> @test_2xdouble_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2
 define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -850,9 +852,10 @@ define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, pt
 define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
@@ -864,9 +867,10 @@ define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec
 define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -879,9 +883,10 @@ define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, pt
 define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll b/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll
index d0b183dfeae6ebf..73e2c7e564c759d 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll
@@ -826,7 +826,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %ve
 define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, ptr %vec2p) {
 ; CHECK-LABEL: test_2xdouble_unpack_low_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -835,9 +835,10 @@ define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, ptr
 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -850,9 +851,10 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -864,9 +866,10 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double>
 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -879,9 +882,10 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -2223,7 +2227,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %v
 define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, ptr %vec2p) {
 ; CHECK-LABEL: test_2xdouble_unpack_high_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -2234,7 +2238,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %ve
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -2249,7 +2253,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -2263,7 +2267,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %ve
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -2278,7 +2282,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index f4eb5b952ae4361..f93a1fff51f3902 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -1938,7 +1938,8 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2,
 ; X64-NEXT:    vmovw (%rsi), %xmm0
 ; X64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; X64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X64-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; X64-NEXT:    retq
 ;
@@ -1948,7 +1949,8 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2,
 ; X86-NEXT:    vmovw (%eax), %xmm0
 ; X86-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; X86-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X86-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
 ; X86-NEXT:    retl
   %6 = load i8, ptr %4, align 1
@@ -2111,7 +2113,7 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X86-NEXT:    vpaddd 8(%ebp), %ymm1, %ymm1
+; X86-NEXT:    vpaddd 36(%ebp){1to8}, %ymm1, %ymm1
 ; X86-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
 ; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; X86-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index d92e1a1e7b9d495..be02a6071ab5825 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -467,8 +467,9 @@ define i8 @test_bitreverse_i8(i8 %a) {
 ;
 ; X86XOP-LABEL: test_bitreverse_i8:
 ; X86XOP:       # %bb.0:
-; X86XOP-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
+; X86XOP-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86XOP-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm0
 ; X86XOP-NEXT:    vmovd %xmm0, %eax
 ; X86XOP-NEXT:    # kill: def $al killed $al killed $eax
 ; X86XOP-NEXT:    retl
@@ -533,8 +534,9 @@ define i4 @test_bitreverse_i4(i4 %a) {
 ;
 ; X86XOP-LABEL: test_bitreverse_i4:
 ; X86XOP:       # %bb.0:
-; X86XOP-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
+; X86XOP-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86XOP-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm0
 ; X86XOP-NEXT:    vmovd %xmm0, %eax
 ; X86XOP-NEXT:    shrb $4, %al
 ; X86XOP-NEXT:    # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 4b0e5441b4abf10..dd62183904c8842 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -77,19 +77,15 @@ entry:
 ; FIXME: This could be 'movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]'.
 
 define <2 x double> @test_negative_zero_2(<2 x double> %A) {
-; SSE2-LABEL: test_negative_zero_2:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_negative_zero_2:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_negative_zero_2:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_negative_zero_2:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %A, i32 0
diff --git a/llvm/test/CodeGen/X86/combine-fabs.ll b/llvm/test/CodeGen/X86/combine-fabs.ll
index 7aa6628cb7f3918..1f2c0ee0c83d9db 100644
--- a/llvm/test/CodeGen/X86/combine-fabs.ll
+++ b/llvm/test/CodeGen/X86/combine-fabs.ll
@@ -40,7 +40,8 @@ define <4 x float> @combine_vec_fabs_constant() {
 define float @combine_fabs_fabs(float %a) {
 ; SSE-LABEL: combine_fabs_fabs:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT:    andps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_fabs_fabs:
@@ -73,7 +74,8 @@ define <4 x float> @combine_vec_fabs_fabs(<4 x float> %a) {
 define float @combine_fabs_fneg(float %a) {
 ; SSE-LABEL: combine_fabs_fneg:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT:    andps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_fabs_fneg:
@@ -106,7 +108,8 @@ define <4 x float> @combine_vec_fabs_fneg(<4 x float> %a) {
 define float @combine_fabs_fcopysign(float %a, float %b) {
 ; SSE-LABEL: combine_fabs_fcopysign:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT:    andps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_fabs_fcopysign:
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 42f09d04da26eda..8a502eebf0d4ea1 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1609,10 +1609,12 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
 ;
 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; XOP-NEXT:    vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm1
+; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm1
 ; XOP-NEXT:    vpsrlq $62, %xmm1, %xmm1
 ; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
-; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm2
+; XOP-NEXT:    vpshaq %xmm2, %xmm1, %xmm1
 ; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; XOP-NEXT:    retq
   %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
@@ -1739,7 +1741,8 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
 ; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm2
 ; XOP-NEXT:    vpsrlq $62, %xmm2, %xmm2
 ; XOP-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
-; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOP-NEXT:    vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm3
+; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm2
 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOP-NEXT:    vpshaq %xmm1, %xmm3, %xmm1
 ; XOP-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -3051,7 +3054,8 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
 ; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
 ; XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpinsrb $15, {{\.?LCPI[0-9]+_[0-9]+}}+15(%rip), %xmm0, %xmm2
+; XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
 ; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 55715197830b14a..58675f515655344 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -631,8 +631,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psrlw $15, %xmm0
-; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    psrlw $7, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = [1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/commute-blend-avx2.ll b/llvm/test/CodeGen/X86/commute-blend-avx2.ll
index 75511104580e903..2ae06f1ab43c0bf 100644
--- a/llvm/test/CodeGen/X86/commute-blend-avx2.ll
+++ b/llvm/test/CodeGen/X86/commute-blend-avx2.ll
@@ -70,7 +70,8 @@ declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nou
 define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, ptr %b) #0 {
 ; CHECK-LABEL: commute_fold_vblendpd_128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; CHECK-NEXT:    retq
   %1 = load <2 x double>, ptr %b
   %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
@@ -81,7 +82,8 @@ declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nou
 define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, ptr %b) #0 {
 ; CHECK-LABEL: commute_fold_vblendpd_256:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; CHECK-NEXT:    vbroadcastsd 24(%rdi), %ymm1
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-NEXT:    retq
   %1 = load <4 x double>, ptr %b
   %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
index 07d6a8ba22bb13f..aa6a5367558429a 100644
--- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll
+++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
@@ -26,7 +26,7 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwi
 define <2 x double> @commute_fold_blendpd(<2 x double> %a, ptr %b) {
 ; CHECK-LABEL: commute_fold_blendpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT:    retq
   %1 = load <2 x double>, ptr %b
   %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
@@ -54,11 +54,11 @@ define <4 x i32> @commute_fold_blend_v4i32(ptr %a, <4 x i32> %b) {
 define void @baz(ptr %arg, ptr %arg1) optsize {
 ; CHECK-LABEL: baz:
 ; CHECK:       # %bb.0: # %bb
-; CHECK-NEXT:    movaps (%rdi), %xmm0
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [3,3]
-; CHECK-NEXT:    andps %xmm0, %xmm1
-; CHECK-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT:    movups %xmm1, (%rsi)
+; CHECK-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-NEXT:    pinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm1
+; CHECK-NEXT:    pand %xmm0, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    movdqu %xmm1, (%rsi)
 ; CHECK-NEXT:    retq
 bb:
   %tmp = load <2 x i64>, ptr %arg, align 16
diff --git a/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll b/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll
index 0052359eedb50aa..0d25e85e2042f11 100644
--- a/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll
+++ b/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll
@@ -13,7 +13,8 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define double @mag_pos0_double(double %x) nounwind {
 ; CHECK-LABEL: mag_pos0_double:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call double @copysign(double 0.0, double %x)
   ret double %y
@@ -25,7 +26,8 @@ define double @mag_pos0_double(double %x) nounwind {
 define double @mag_neg0_double(double %x) nounwind {
 ; CHECK-LABEL: mag_neg0_double:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call double @copysign(double -0.0, double %x)
   ret double %y
@@ -41,7 +43,8 @@ define double @mag_pos1_double(double %x) nounwind {
 ; CHECK-LABEL: mag_pos1_double:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call double @copysign(double 1.0, double %x)
   ret double %y
@@ -58,7 +61,8 @@ define double @mag_neg1_double(double %x) nounwind {
 ; CHECK-LABEL: mag_neg1_double:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call double @copysign(double -1.0, double %x)
   ret double %y
@@ -73,7 +77,8 @@ define double @mag_neg1_double(double %x) nounwind {
 define float @mag_pos0_float(float %x) nounwind {
 ; CHECK-LABEL: mag_pos0_float:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call float @copysignf(float 0.0, float %x)
   ret float %y
@@ -85,7 +90,8 @@ define float @mag_pos0_float(float %x) nounwind {
 define float @mag_neg0_float(float %x) nounwind {
 ; CHECK-LABEL: mag_neg0_float:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call float @copysignf(float -0.0, float %x)
   ret float %y
@@ -103,7 +109,8 @@ define float @mag_pos1_float(float %x) nounwind {
 ; CHECK-LABEL: mag_pos1_float:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call float @copysignf(float 1.0, float %x)
   ret float %y
@@ -124,7 +131,8 @@ define float @mag_neg1_float(float %x) nounwind {
 ; CHECK-LABEL: mag_neg1_float:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call float @copysignf(float -1.0, float %x)
   ret float %y
diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
index f12693469a3f6ec..dba07d80c6cd667 100644
--- a/llvm/test/CodeGen/X86/extract-concat.ll
+++ b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -153,14 +153,16 @@ define <16 x i64> @load_catcat(ptr %p) {
 define <4 x i32> @cat_ext_straddle(ptr %px, ptr %py) {
 ; SSE-LABEL: cat_ext_straddle:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps 16(%rdi), %xmm0
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: cat_ext_straddle:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX-NEXT:    retq
   %x = load <6 x i32>, ptr %px
   %y = load <6 x i32>, ptr %py
diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
index 944f6bbfd0bfbe0..dfc81b3c3fb915d 100644
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -353,9 +353,9 @@ define float @select_fcmp_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z,
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    vmovaps 8(%ebp), %xmm3
 ; X86-NEXT:    vcmpneq_oqss %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -546,7 +546,8 @@ define float @fabs_v4f32(<4 x float> %x) nounwind {
 define double @fabs_v4f64(<4 x double> %x) nounwind {
 ; X64-LABEL: fabs_v4f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovsd {{.*#+}} xmm1 = [NaN,0.0E+0]
+; X64-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 022b25a24153338..e7758acf126d631 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -13,10 +13,16 @@ define i32 @t(ptr %val) nounwind  {
 ; X86-SSE2-NEXT:    movl 8(%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X64-LABEL: t:
-; X64:       # %bb.0:
-; X64-NEXT:    movl 8(%rdi), %eax
-; X64-NEXT:    retq
+; X64-SSSE3-LABEL: t:
+; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    movl 8(%rdi), %eax
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX-LABEL: t:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT:    vextractps $2, %xmm0, %eax
+; X64-AVX-NEXT:    retq
   %tmp2 = load <2 x i64>, ptr %val, align 16		; <<2 x i64>> [#uses=1]
   %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32>		; <<4 x i32>> [#uses=1]
   %tmp4 = extractelement <4 x i32> %tmp3, i32 2		; <i32> [#uses=1]
@@ -76,9 +82,11 @@ bb:
 define i64 @t4(ptr %a) {
 ; X86-SSE2-LABEL: t4:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl (%ecx), %eax
-; X86-SSE2-NEXT:    movl 4(%ecx), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-LABEL: t4:
@@ -126,8 +134,7 @@ define float @t6(ptr%a0) {
 ; X86-SSE2-NEXT:    pushl %eax
 ; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movaps (%eax), %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE2-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -142,7 +149,7 @@ define float @t6(ptr%a0) {
 ;
 ; X64-SSSE3-LABEL: t6:
 ; X64-SSSE3:       # %bb.0:
-; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
 ; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -226,8 +233,7 @@ define float @PR43971_1(ptr%a0) nounwind {
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    pushl %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movaps (%eax), %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE2-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -241,7 +247,7 @@ define float @PR43971_1(ptr%a0) nounwind {
 ;
 ; X64-SSSE3-LABEL: PR43971_1:
 ; X64-SSSE3:       # %bb.0: # %entry
-; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
 ; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -317,12 +323,27 @@ define void @subextract_broadcast_load_constant(ptr nocapture %0, ptr nocapture
 ; X86-SSE2-NEXT:    movw $-24160, (%eax) # imm = 0xA1A0
 ; X86-SSE2-NEXT:    retl
 ;
-; X64-LABEL: subextract_broadcast_load_constant:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
-; X64-NEXT:    movw $-24674, (%rsi) # imm = 0x9F9E
-; X64-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
-; X64-NEXT:    retq
+; X64-SSSE3-LABEL: subextract_broadcast_load_constant:
+; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
+; X64-SSSE3-NEXT:    movw $-24674, (%rsi) # imm = 0x9F9E
+; X64-SSSE3-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX1-LABEL: subextract_broadcast_load_constant:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
+; X64-AVX1-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; X64-AVX1-NEXT:    movw %ax, (%rsi)
+; X64-AVX1-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: subextract_broadcast_load_constant:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
+; X64-AVX2-NEXT:    movw $-24674, (%rsi) # imm = 0x9F9E
+; X64-AVX2-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
+; X64-AVX2-NEXT:    retq
   store i8 -98, ptr %0, align 1
   %4 = getelementptr inbounds i8, ptr %0, i64 1
   store i8 -97, ptr %4, align 1
diff --git a/llvm/test/CodeGen/X86/fabs.ll b/llvm/test/CodeGen/X86/fabs.ll
index 82c82ac3e917e35..d553cb7516fab4f 100644
--- a/llvm/test/CodeGen/X86/fabs.ll
+++ b/llvm/test/CodeGen/X86/fabs.ll
@@ -21,7 +21,8 @@ define float @test1(float %X) {
 ;
 ; X64-LABEL: test1:
 ; X64:       # %bb.0:
-; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    andps %xmm1, %xmm0
 ; X64-NEXT:    retq
   %Y = call float @fabsf(float %X) readnone
   ret float %Y
diff --git a/llvm/test/CodeGen/X86/fast-isel-fneg.ll b/llvm/test/CodeGen/X86/fast-isel-fneg.ll
index 128f5ee0c318bc5..240da2c8478492a 100644
--- a/llvm/test/CodeGen/X86/fast-isel-fneg.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-fneg.ll
@@ -40,8 +40,9 @@ define float @fneg_f32(float %x) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushl %eax
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; SSE2-NEXT:    movss %xmm0, (%esp)
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT:    xorps %xmm0, %xmm1
+; SSE2-NEXT:    movss %xmm1, (%esp)
 ; SSE2-NEXT:    flds (%esp)
 ; SSE2-NEXT:    popl %eax
 ; SSE2-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/fma-signed-zero.ll b/llvm/test/CodeGen/X86/fma-signed-zero.ll
index f9e4e9929c6c4a9..080469bd7d6da8e 100644
--- a/llvm/test/CodeGen/X86/fma-signed-zero.ll
+++ b/llvm/test/CodeGen/X86/fma-signed-zero.ll
@@ -10,7 +10,8 @@ define float @fneg_fma32(float %x, float %y, float %z) {
 ; CHECK-LABEL: fneg_fma32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
-; CHECK-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %negx = fneg float %x
   %negz = fneg float %z
@@ -37,7 +38,8 @@ define double @fneg_fma64(double %x, double %y, double %z) {
 ; CHECK-LABEL: fneg_fma64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
-; CHECK-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %negx = fneg double %x
   %negz = fneg double %z
diff --git a/llvm/test/CodeGen/X86/fp-fold.ll b/llvm/test/CodeGen/X86/fp-fold.ll
index 74b5232a4df62d5..93716a48542ea95 100644
--- a/llvm/test/CodeGen/X86/fp-fold.ll
+++ b/llvm/test/CodeGen/X86/fp-fold.ll
@@ -31,7 +31,7 @@ define float @fadd_produce_zero(float %x) {
 define float @fadd_reassociate(float %x) {
 ; CHECK-LABEL: fadd_reassociate:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %sum = fadd float %x, 8.0
   %r = fadd reassoc nsz float %sum, 12.0
@@ -85,7 +85,7 @@ define float @fsub_neg_x_y(float %x, float %y) {
 define float @fsub_neg_y(float %x, float %y) {
 ; CHECK-LABEL: fsub_neg_y:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul float %x, 5.0
   %add = fadd float %mul, %y
@@ -96,7 +96,7 @@ define float @fsub_neg_y(float %x, float %y) {
 define <4 x float> @fsub_neg_y_vector(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_neg_y_vector:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul <4 x float> %x, <float 5.0, float 5.0, float 5.0, float 5.0>
   %add = fadd <4 x float> %mul, %y
@@ -107,7 +107,7 @@ define <4 x float> @fsub_neg_y_vector(<4 x float> %x, <4 x float> %y) {
 define <4 x float> @fsub_neg_y_vector_nonuniform(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_neg_y_vector_nonuniform:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul <4 x float> %x, <float 5.0, float 6.0, float 7.0, float 8.0>
   %add = fadd <4 x float> %mul, %y
@@ -118,7 +118,7 @@ define <4 x float> @fsub_neg_y_vector_nonuniform(<4 x float> %x, <4 x float> %y)
 define float @fsub_neg_y_commute(float %x, float %y) {
 ; CHECK-LABEL: fsub_neg_y_commute:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul float %x, 5.0
   %add = fadd float %y, %mul
@@ -129,7 +129,7 @@ define float @fsub_neg_y_commute(float %x, float %y) {
 define <4 x float> @fsub_neg_y_commute_vector(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_neg_y_commute_vector:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul <4 x float> %x, <float 5.0, float 5.0, float 5.0, float 5.0>
   %add = fadd <4 x float> %y, %mul
@@ -142,7 +142,8 @@ define <4 x float> @fsub_neg_y_commute_vector(<4 x float> %x, <4 x float> %y) {
 define float @fsub_fadd_common_op_fneg(float %x, float %y) {
 ; CHECK-LABEL: fsub_fadd_common_op_fneg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %a = fadd float %x, %y
   %r = fsub reassoc nsz float %y, %a
@@ -154,7 +155,7 @@ define float @fsub_fadd_common_op_fneg(float %x, float %y) {
 define <4 x float> @fsub_fadd_common_op_fneg_vec(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_fadd_common_op_fneg_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %a = fadd <4 x float> %x, %y
   %r = fsub nsz reassoc <4 x float> %y, %a
@@ -167,7 +168,8 @@ define <4 x float> @fsub_fadd_common_op_fneg_vec(<4 x float> %x, <4 x float> %y)
 define float @fsub_fadd_common_op_fneg_commute(float %x, float %y) {
 ; CHECK-LABEL: fsub_fadd_common_op_fneg_commute:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %a = fadd float %y, %x
   %r = fsub reassoc nsz float %y, %a
@@ -179,7 +181,7 @@ define float @fsub_fadd_common_op_fneg_commute(float %x, float %y) {
 define <4 x float> @fsub_fadd_common_op_fneg_commute_vec(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_fadd_common_op_fneg_commute_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %a = fadd <4 x float> %y, %x
   %r = fsub reassoc nsz <4 x float> %y, %a
@@ -233,7 +235,8 @@ define float @fsub_zero_nsz_1(float %x) {
 define float @fsub_zero_nsz_2(float %x) {
 ; CHECK-LABEL: fsub_zero_nsz_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %r = fsub nsz float 0.0, %x
   ret float %r
@@ -259,7 +262,7 @@ define float @fmul_one(float %x) {
 define float @fmul_x_const_const(float %x) {
 ; CHECK-LABEL: fmul_x_const_const:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul reassoc float %x, 9.0
   %r = fmul reassoc float %mul, 4.0
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
index 71d49481ebb8e71..5bd48d80354fc6a 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
@@ -9,7 +9,8 @@ define float @f1(float %0, float %1, float %2) #0 {
 ; NOFMA:       # %bb.0: # %entry
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NOFMA-NEXT:    movss {{.*#+}} xmm3 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm3, %xmm0
 ; NOFMA-NEXT:    callq fmaf@PLT
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
@@ -37,7 +38,8 @@ define double @f2(double %0, double %1, double %2) #0 {
 ; NOFMA:       # %bb.0: # %entry
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NOFMA-NEXT:    movsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm3, %xmm0
 ; NOFMA-NEXT:    callq fma@PLT
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
@@ -65,7 +67,8 @@ define float @f3(float %0, float %1, float %2) #0 {
 ; NOFMA:       # %bb.0: # %entry
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; NOFMA-NEXT:    movss {{.*#+}} xmm3 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm3, %xmm2
 ; NOFMA-NEXT:    callq fmaf@PLT
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
@@ -93,7 +96,8 @@ define double @f4(double %0, double %1, double %2) #0 {
 ; NOFMA:       # %bb.0: # %entry
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; NOFMA-NEXT:    movsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm3, %xmm2
 ; NOFMA-NEXT:    callq fma@PLT
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
@@ -184,7 +188,8 @@ define float @f7(float %0, float %1, float %2) #0 {
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
 ; NOFMA-NEXT:    callq fmaf@PLT
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NOFMA-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm1, %xmm0
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
 ; NOFMA-NEXT:    retq
@@ -192,13 +197,15 @@ define float @f7(float %0, float %1, float %2) #0 {
 ; FMA-AVX1-LABEL: f7:
 ; FMA-AVX1:       # %bb.0: # %entry
 ; FMA-AVX1-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; FMA-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; FMA-AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; FMA-AVX1-NEXT:    retq
 ;
 ; FMA4-LABEL: f7:
 ; FMA4:       # %bb.0: # %entry
 ; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
-; FMA4-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; FMA4-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 ;
 ; FMA-AVX512-LABEL: f7:
@@ -221,7 +228,8 @@ define double @f8(double %0, double %1, double %2) #0 {
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
 ; NOFMA-NEXT:    callq fma@PLT
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NOFMA-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm1, %xmm0
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
 ; NOFMA-NEXT:    retq
@@ -229,13 +237,15 @@ define double @f8(double %0, double %1, double %2) #0 {
 ; FMA-LABEL: f8:
 ; FMA:       # %bb.0: # %entry
 ; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; FMA-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; FMA-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; FMA-NEXT:    retq
 ;
 ; FMA4-LABEL: f8:
 ; FMA4:       # %bb.0: # %entry
 ; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
-; FMA4-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; FMA4-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 entry:
   %3 = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2,
@@ -262,13 +272,15 @@ define float @f9(float %0, float %1, float %2) #0 {
 ; FMA-AVX1-LABEL: f9:
 ; FMA-AVX1:       # %bb.0: # %entry
 ; FMA-AVX1-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
-; FMA-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; FMA-AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; FMA-AVX1-NEXT:    retq
 ;
 ; FMA4-LABEL: f9:
 ; FMA4:       # %bb.0: # %entry
 ; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
-; FMA4-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; FMA4-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 ;
 ; FMA-AVX512-LABEL: f9:
@@ -304,13 +316,15 @@ define double @f10(double %0, double %1, double %2) #0 {
 ; FMA-LABEL: f10:
 ; FMA:       # %bb.0: # %entry
 ; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
-; FMA-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; FMA-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; FMA-NEXT:    retq
 ;
 ; FMA4-LABEL: f10:
 ; FMA4:       # %bb.0: # %entry
 ; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
-; FMA4-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; FMA4-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 entry:
   %3 = fneg double %0
diff --git a/llvm/test/CodeGen/X86/fp-logic.ll b/llvm/test/CodeGen/X86/fp-logic.ll
index 522a1589caf09f4..a21204cb307717c 100644
--- a/llvm/test/CodeGen/X86/fp-logic.ll
+++ b/llvm/test/CodeGen/X86/fp-logic.ll
@@ -243,7 +243,8 @@ define float @movmsk(float %x) {
 define double @bitcast_fabs(double %x) {
 ; CHECK-LABEL: bitcast_fabs:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [NaN,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %bc1 = bitcast double %x to i64
   %and = and i64 %bc1, 9223372036854775807
@@ -254,7 +255,8 @@ define double @bitcast_fabs(double %x) {
 define float @bitcast_fneg(float %x) {
 ; CHECK-LABEL: bitcast_fneg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %bc1 = bitcast float %x to i32
   %xor = xor i32 %bc1, 2147483648
@@ -311,7 +313,8 @@ define float @fsub_bitcast_fneg(float %x, float %y) {
 define float @nabsf(float %a) {
 ; CHECK-LABEL: nabsf:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %conv = bitcast float %a to i32
   %and = or i32 %conv, -2147483648
@@ -322,7 +325,8 @@ define float @nabsf(float %a) {
 define double @nabsd(double %a) {
 ; CHECK-LABEL: nabsd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %conv = bitcast double %a to i64
   %and = or i64 %conv, -9223372036854775808
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
index 8efd5819a6d22b4..2ea83598219a314 100644
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -24,10 +24,11 @@ define half @round_f16(half %h) {
 ; SSE41-NEXT:    callq __extendhfsf2@PLT
 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; SSE41-NEXT:    andps %xmm0, %xmm1
-; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    movss {{.*#+}} xmm2 = [4.9999997E-1,0.0E+0,0.0E+0,0.0E+0]
+; SSE41-NEXT:    orps %xmm1, %xmm2
+; SSE41-NEXT:    addss %xmm0, %xmm2
 ; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
+; SSE41-NEXT:    roundss $11, %xmm2, %xmm0
 ; SSE41-NEXT:    callq __truncsfhf2@PLT
 ; SSE41-NEXT:    popq %rax
 ; SSE41-NEXT:    .cfi_def_cfa_offset 8
@@ -55,7 +56,7 @@ define half @round_f16(half %h) {
 ; AVX512F-NEXT:    vmovd %eax, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -67,7 +68,7 @@ define half @round_f16(half %h) {
 ; AVX512FP16:       ## %bb.0: ## %entry
 ; AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, %xmm1, %xmm0, %xmm2
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm2 = xmm2 | (xmm0 & xmm1)
 ; AVX512FP16-NEXT:    vaddsh %xmm2, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vrndscalesh $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -85,10 +86,11 @@ define float @round_f32(float %x) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; SSE41-NEXT:    andps %xmm0, %xmm1
-; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    movss {{.*#+}} xmm2 = [4.9999997E-1,0.0E+0,0.0E+0,0.0E+0]
+; SSE41-NEXT:    orps %xmm1, %xmm2
+; SSE41-NEXT:    addss %xmm0, %xmm2
 ; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
+; SSE41-NEXT:    roundss $11, %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: round_f32:
@@ -103,7 +105,7 @@ define float @round_f32(float %x) {
 ; AVX512F-LABEL: round_f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -111,7 +113,7 @@ define float @round_f32(float %x) {
 ; AVX512FP16-LABEL: round_f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512FP16-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -128,10 +130,11 @@ define double @round_f64(double %x) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
 ; SSE41-NEXT:    andpd %xmm0, %xmm1
-; SSE41-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    addsd %xmm0, %xmm1
+; SSE41-NEXT:    movsd {{.*#+}} xmm2 = [4.9999999999999994E-1,0.0E+0]
+; SSE41-NEXT:    orpd %xmm1, %xmm2
+; SSE41-NEXT:    addsd %xmm0, %xmm2
 ; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    roundsd $11, %xmm1, %xmm0
+; SSE41-NEXT:    roundsd $11, %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: round_f64:
@@ -147,7 +150,7 @@ define double @round_f64(double %x) {
 ; AVX512F-LABEL: round_f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -155,7 +158,7 @@ define double @round_f64(double %x) {
 ; AVX512FP16-LABEL: round_f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512FP16-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -213,7 +216,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) {
 ; AVX512F-LABEL: round_v4f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundps $11, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -221,7 +224,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) {
 ; AVX512FP16-LABEL: round_v4f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512FP16-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundps $11, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -267,7 +270,7 @@ define <2 x double> @round_v2f64(<2 x double> %x) {
 ; AVX512F-LABEL: round_v2f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundpd $11, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -275,7 +278,7 @@ define <2 x double> @round_v2f64(<2 x double> %x) {
 ; AVX512FP16-LABEL: round_v2f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512FP16-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundpd $11, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -361,7 +364,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) {
 ; AVX512F-LABEL: round_v8f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
 ; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vroundps $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -369,7 +372,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) {
 ; AVX512FP16-LABEL: round_v8f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
 ; AVX512FP16-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    vroundps $11, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    retq
@@ -431,7 +434,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) {
 ; AVX512F-LABEL: round_v4f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
 ; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vroundpd $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -439,7 +442,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) {
 ; AVX512FP16-LABEL: round_v4f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
 ; AVX512FP16-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    vroundpd $11, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    retq
@@ -587,7 +590,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
 ; AVX512F-LABEL: round_v16f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
 ; AVX512F-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vrndscaleps $11, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -595,7 +598,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
 ; AVX512FP16-LABEL: round_v16f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
 ; AVX512FP16-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    vrndscaleps $11, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    retq
@@ -695,7 +698,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
 ; AVX512F-LABEL: round_v8f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
 ; AVX512F-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vrndscalepd $11, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -703,7 +706,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
 ; AVX512FP16-LABEL: round_v8f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
 ; AVX512FP16-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    vrndscalepd $11, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll
index 1de2484d47ba1b0..d16fdcdf1752dac 100644
--- a/llvm/test/CodeGen/X86/fp128-cast.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast.ll
@@ -1260,7 +1260,8 @@ define fp128 @TestTruncCopysign(fp128 %x, i32 %n) nounwind {
 ; X64-SSE-NEXT:    pushq %rax
 ; X64-SSE-NEXT:    callq __trunctfdf2@PLT
 ; X64-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm1 = [+Inf,0.0E+0]
+; X64-SSE-NEXT:    orps %xmm1, %xmm0
 ; X64-SSE-NEXT:    callq __extenddftf2@PLT
 ; X64-SSE-NEXT:    addq $8, %rsp
 ; X64-SSE-NEXT:  .LBB26_2: # %cleanup
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 3af8b1aec1feb2e..5a1f0da86de6332 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -335,7 +335,8 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ; F16C-LABEL: test_half_fabs:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16C-NEXT:    vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; F16C-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
@@ -352,7 +353,8 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %rdi, %rbx
 ; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movd {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    pand %xmm1, %xmm0
 ; X64-NEXT:    callq __truncsfhf2@PLT
 ; X64-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-NEXT:    movw %ax, (%rbx)
@@ -515,7 +517,8 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
 ; F16C-LABEL: test_half_fneg:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16C-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; F16C-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
@@ -532,7 +535,8 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %rdi, %rbx
 ; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    pxor %xmm1, %xmm0
 ; X64-NEXT:    callq __truncsfhf2@PLT
 ; X64-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-NEXT:    movw %ax, (%rbx)
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 362b3b945f9622d..57695316386b78b 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -311,8 +311,8 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
-; X86-NEXT:    vpinsrd $0, %ecx, %xmm0, %xmm0
+; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    vpinsrd $1, {{\.?LCPI[0-9]+_[0-9]+}}+4, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
@@ -320,8 +320,8 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
 ;
 ; X64-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42]
-; X64-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    vpinsrd $1, {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 0ca3380d188b7fb..2d8484fb82faef4 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -2147,7 +2147,7 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: splatvar_fshl_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; GFNISSE-NEXT:    movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm9
 ; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
 ; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
@@ -2247,24 +2247,25 @@ define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt
 ;
 ; GFNIAVX512VL-LABEL: splatvar_fshl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; GFNIAVX512VL-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: splatvar_fshl_v64i8:
@@ -2286,7 +2287,7 @@ define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt
 define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: splatvar_fshr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT:    movd {{.*#+}} xmm9 = mem[0],zero,zero,zero
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm10
 ; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
 ; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9
@@ -2389,25 +2390,26 @@ define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt
 ;
 ; GFNIAVX512VL-LABEL: splatvar_fshr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; GFNIAVX512VL-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm3, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: splatvar_fshr_v64i8:
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 6e7f109a5da5c29..df5f8892c879f69 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1272,23 +1272,21 @@ define <8 x half> @select(i1 %c, <8 x half> %x, <8 x half> %y) {
 define <8 x half> @shuffle(ptr %p) {
 ; CHECK-LIBCALL-LABEL: shuffle:
 ; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    movdqu (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-LIBCALL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-LIBCALL-NEXT:    pinsrw $0, 8(%rdi), %xmm0
+; CHECK-LIBCALL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; CHECK-LIBCALL-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: shuffle:
 ; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,4,4,4,4]
-; BWON-F16C-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; BWON-F16C-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
+; BWON-F16C-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: shuffle:
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    movdqu (%eax), %xmm0
-; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-I686-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-I686-NEXT:    pinsrw $0, 8(%eax), %xmm0
+; CHECK-I686-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; CHECK-I686-NEXT:    retl
   %1 = load <8 x half>, ptr %p, align 8
   %2 = shufflevector <8 x half> %1, <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index c44945ac2d929f6..20aa93bf10ec285 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -145,9 +145,9 @@ define <2 x i64> @elt0_v2i64(i64 %x) {
 ;
 ; X64-SSE2-LABEL: elt0_v2i64:
 ; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movq %rdi, %xmm1
-; X64-SSE2-NEXT:    movapd {{.*#+}} xmm0 = [u,1]
-; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-SSE2-NEXT:    movq %rdi, %xmm0
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE4-LABEL: elt0_v2i64:
@@ -218,28 +218,26 @@ define <4 x float> @elt1_v4f32(float %x) {
 define <2 x double> @elt1_v2f64(double %x) {
 ; X86-SSE-LABEL: elt1_v2f64:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,u]
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: elt1_v2f64:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movaps {{.*#+}} xmm1 = [4.2E+1,u]
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; X64-SSE-NEXT:    movaps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; X86-AVX-LABEL: elt1_v2f64:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [4.2E+1,4.2E+1]
-; X86-AVX-NEXT:    # xmm0 = mem[0,0]
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-AVX-LABEL: elt1_v2f64:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
-; X64-AVX-NEXT:    # xmm1 = mem[0,0]
+; X64-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64-AVX-NEXT:    retq
    %ins = insertelement <2 x double> <double 42.0, double 1.0>, double %x, i32 1
@@ -384,7 +382,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ; X64-SSE2-LABEL: elt5_v8i64:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    movq %rdi, %xmm0
-; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4,u]
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = [4,0]
 ; X64-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
 ; X64-SSE2-NEXT:    movaps {{.*#+}} xmm0 = [42,1]
 ; X64-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [2,3]
@@ -457,7 +455,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 define <8 x double> @elt1_v8f64(double %x) {
 ; X86-SSE-LABEL: elt1_v8f64:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,u]
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-SSE-NEXT:    movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0]
 ; X86-SSE-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0]
@@ -466,7 +464,7 @@ define <8 x double> @elt1_v8f64(double %x) {
 ;
 ; X64-SSE-LABEL: elt1_v8f64:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movaps {{.*#+}} xmm4 = [4.2E+1,u]
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm4 = [4.2E+1,0.0E+0]
 ; X64-SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
 ; X64-SSE-NEXT:    movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0]
 ; X64-SSE-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0]
@@ -476,47 +474,49 @@ define <8 x double> @elt1_v8f64(double %x) {
 ;
 ; X86-AVX1-LABEL: elt1_v8f64:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [4.2E+1,u,2.0E+0,3.0E+0]
-; X86-AVX1-NEXT:    vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; X86-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; X86-AVX1-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X86-AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X64-AVX1-LABEL: elt1_v8f64:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [4.2E+1,u,2.0E+0,3.0E+0]
+; X64-AVX1-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X64-AVX1-NEXT:    retq
 ;
 ; X86-AVX2-LABEL: elt1_v8f64:
 ; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [4.2E+1,u,2.0E+0,3.0E+0]
-; X86-AVX2-NEXT:    vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; X86-AVX2-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X86-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: elt1_v8f64:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.2E+1,u,2.0E+0,3.0E+0]
+; X64-AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X64-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: elt1_v8f64:
 ; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
-; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; X86-AVX512F-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X86-AVX512F-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X64-AVX512F-LABEL: elt1_v8f64:
 ; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X64-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X64-AVX512F-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; X64-AVX512F-NEXT:    retq
    %ins = insertelement <8 x double> <double 42.0, double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>, double %x, i32 1
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll
index 18edc83b7edcf7d..ee7af27b0ac706e 100644
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -269,12 +269,12 @@ define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
 define float @extract_lane_insertps_5123(<4 x float> %a0, ptr%p1) {
 ; SSE-LABEL: extract_lane_insertps_5123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3]
+; SSE-NEXT:    movss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_5123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3]
+; AVX-NEXT:    vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, ptr%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 64)
@@ -285,13 +285,12 @@ define float @extract_lane_insertps_5123(<4 x float> %a0, ptr%p1) {
 define float @extract_lane_insertps_6123(<4 x float> %a0, ptr%p1) {
 ; SSE-LABEL: extract_lane_insertps_6123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movss 8(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_6123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd $1, (%rdi), %xmm0 # xmm0 = mem[1,0]
+; AVX-NEXT:    vmovss 8(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, ptr%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
@@ -308,7 +307,8 @@ define <4 x float> @commute_load_insertps(<4 x float>, ptr nocapture readonly) {
 ;
 ; AVX-LABEL: commute_load_insertps:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vinsertps $53, 12(%rdi), %xmm0, %xmm0 # xmm0 = zero,xmm0[1],zero,mem[0]
+; AVX-NEXT:    vbroadcastss 12(%rdi), %xmm1
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[1],zero,xmm1[3]
 ; AVX-NEXT:    retq
   %3 = load <4 x float>, ptr %1
   %4 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %3, <4 x float> %0, i8 85)
diff --git a/llvm/test/CodeGen/X86/insertps-from-constantpool.ll b/llvm/test/CodeGen/X86/insertps-from-constantpool.ll
index f03df634dc1de47..99ed327c36c3e22 100644
--- a/llvm/test/CodeGen/X86/insertps-from-constantpool.ll
+++ b/llvm/test/CodeGen/X86/insertps-from-constantpool.ll
@@ -7,12 +7,14 @@
 define <4 x float> @fold_from_constantpool(<4 x float> %a) {
 ; X86-LABEL: fold_from_constantpool:
 ; X86:       # %bb.0:
-; X86-NEXT:    insertps $0, {{\.?LCPI[0-9]+_[0-9]+}}+4, %xmm0 # xmm0 = mem[0],xmm0[1,2,3]
+; X86-NEXT:    movss {{\.?LCPI[0-9]+_[0-9]+}}+4, %xmm1 # xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fold_from_constantpool:
 ; X64:       # %bb.0:
-; X64-NEXT:    insertps $0, {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0 # xmm0 = mem[0],xmm0[1,2,3]
+; X64-NEXT:    movss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> <float 0.0, float 1.0, float 0.0, float 0.0>, i8 64)
   ret <4 x float> %1
diff --git a/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
index 93b60c27255f330..29737b3acf55e5f 100644
--- a/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
+++ b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
@@ -11,7 +11,7 @@ define <4 x float> @insertps_unfold(ptr %v0, ptr %v1) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    movaps (%eax), %xmm0
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; X32-NEXT:    addps %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
@@ -19,7 +19,7 @@ define <4 x float> @insertps_unfold(ptr %v0, ptr %v1) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-NEXT:    movaps (%rdi), %xmm0
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; X64-NEXT:    addps %xmm1, %xmm0
 ; X64-NEXT:    retq
   %a = getelementptr inbounds <4 x float>, ptr %v1, i64 0, i64 1
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 97136dafa6c2c00..198d1a08a9342dc 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -2595,8 +2595,9 @@ define i1 @issubnormal_or_zero_or_nan_f(float %x) {
 ;
 ; X64-LABEL: issubnormal_or_zero_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    andps %xmm0, %xmm1
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243)  ; 0xf0|0x3 = "subnormal|zero|nan"
@@ -2764,8 +2765,9 @@ define i1 @not_issubnormal_or_zero_or_nan_f(float %x) {
 ;
 ; X64-LABEL: not_issubnormal_or_zero_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    andps %xmm0, %xmm1
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780)  ; ~(0xf0|0x3) = ~"subnormal|zero|nan"
diff --git a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll
index fb7efc2200c671d..ad8878a6f83b7f6 100644
--- a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll
+++ b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll
@@ -5,8 +5,11 @@ define void @csrot_(ptr %0) {
 ; CHECK-LABEL: csrot_:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    movlps %xmm0, (%rax)
 ; CHECK-NEXT:    retq
 1:
diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll
index dba63582ff08b16..3a1cfcb9c9a6fab 100644
--- a/llvm/test/CodeGen/X86/load-partial.ll
+++ b/llvm/test/CodeGen/X86/load-partial.ll
@@ -211,13 +211,11 @@ define <4 x float> @load_float4_float3_trunc_0123(ptr nocapture readonly derefer
 ; SSE2-LABEL: load_float4_float3_trunc_0123:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps (%rdi), %xmm0
-; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_float4_float3_trunc_0123:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movaps (%rdi), %xmm0
-; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_float4_float3_trunc_0123:
@@ -257,13 +255,11 @@ define <4 x float> @load_float4_float3_trunc_0123_unaligned(ptr nocapture readon
 ; SSE2-LABEL: load_float4_float3_trunc_0123_unaligned:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movups (%rdi), %xmm0
-; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_float4_float3_trunc_0123_unaligned:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movups (%rdi), %xmm0
-; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_float4_float3_trunc_0123_unaligned:
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 89459a2d10177d1..7cba05d61a1c3d7 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6791,7 +6791,8 @@ define <8 x double> @mload_constmask_v8f64(ptr %addr, <8 x double> %dst) {
 ; AVX1OR2-LABEL: mload_constmask_v8f64:
 ; AVX1OR2:       ## %bb.0:
 ; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
-; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX1OR2-NEXT:    vbroadcastsd 56(%rdi), %ymm2
+; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512F-LABEL: mload_constmask_v8f64:
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index c7ec5e87dcc6bd2..1a8200c322973fc 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -4701,13 +4701,13 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
 ; SSE2-NEXT:    movq %xmm5, (%rdi)
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm5, 8(%rdi)
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = mem[2,3,2,3]
-; SSE2-NEXT:    movq %xmm5, 24(%rdi)
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq %rcx, 24(%rdi)
 ; SSE2-NEXT:    movq %rax, 32(%rdi)
 ; SSE2-NEXT:    movq %xmm4, 48(%rdi)
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
@@ -4733,11 +4733,10 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16
 ; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
 ; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
 ; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE4-NEXT:    movups %xmm6, (%rdi)
-; SSE4-NEXT:    palignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; SSE4-NEXT:    movdqu %xmm5, 24(%rdi)
+; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE4-NEXT:    movups %xmm5, (%rdi)
+; SSE4-NEXT:    movups {{[0-9]+}}(%rsp), %xmm5
+; SSE4-NEXT:    movups %xmm5, 24(%rdi)
 ; SSE4-NEXT:    movups %xmm4, 48(%rdi)
 ; SSE4-NEXT:    movups %xmm3, 64(%rdi)
 ; SSE4-NEXT:    movups %xmm2, 80(%rdi)
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 73d459ba7702649..751805db38ec06d 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -325,14 +325,11 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    andq (%rsi), %rax
-; X64-NEXT:    movq %rax, %xmm0
 ; X64-NEXT:    movq %rax, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    por %xmm0, %xmm1
-; X64-NEXT:    movq %xmm1, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pxor %xmm1, %xmm0
-; X64-NEXT:    movq %xmm0, (%rdi)
+; X64-NEXT:    orq (%rsi), %rax
+; X64-NEXT:    movq %rax, (%rdi)
+; X64-NEXT:    xorq (%rsi), %rax
+; X64-NEXT:    movq %rax, (%rdi)
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll
index 802098250981983..84bbf18dde36a3d 100644
--- a/llvm/test/CodeGen/X86/neg_fp.ll
+++ b/llvm/test/CodeGen/X86/neg_fp.ll
@@ -10,8 +10,9 @@ define float @negfp(float %a, float %b) nounwind {
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm0, %xmm1
+; CHECK-NEXT:    movss %xmm1, (%esp)
 ; CHECK-NEXT:    flds (%esp)
 ; CHECK-NEXT:    popl %eax
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/negative-sin.ll b/llvm/test/CodeGen/X86/negative-sin.ll
index f24507d3a4f38af..3368dde860875e9 100644
--- a/llvm/test/CodeGen/X86/negative-sin.ll
+++ b/llvm/test/CodeGen/X86/negative-sin.ll
@@ -56,7 +56,8 @@ define double @semi_strict1(double %e) nounwind {
 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    callq sin at PLT
-; CHECK-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
   %f = fsub double 0.0, %e
diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll
index 384e40496d82a6b..ce82ad7857dda89 100644
--- a/llvm/test/CodeGen/X86/packus.ll
+++ b/llvm/test/CodeGen/X86/packus.ll
@@ -118,25 +118,45 @@ define <8 x i16> @trunc_lshr_v8i32(<8 x i32> %a) nounwind {
 }
 
 define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) {
-; SSE2-LABEL: trunc_lshr_v4i64_demandedelts:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    ret{{[l|q]}}
+; X86-SSE2-LABEL: trunc_lshr_v4i64_demandedelts:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movd {{.*#+}} xmm2 = [1,0,0,0]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; X86-SSE2-NEXT:    retl
 ;
-; SSE4-LABEL: trunc_lshr_v4i64_demandedelts:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE4-NEXT:    pmovsxbd {{.*#+}} xmm2 = [1,1,1,1]
-; SSE4-NEXT:    pand %xmm2, %xmm1
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE4-NEXT:    pand %xmm2, %xmm0
-; SSE4-NEXT:    packusdw %xmm1, %xmm0
-; SSE4-NEXT:    ret{{[l|q]}}
+; X64-SSE2-LABEL: trunc_lshr_v4i64_demandedelts:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = [1,0]
+; X64-SSE2-NEXT:    pand %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; X64-SSE2-NEXT:    retq
+;
+; X86-SSE4-LABEL: trunc_lshr_v4i64_demandedelts:
+; X86-SSE4:       # %bb.0:
+; X86-SSE4-NEXT:    movd {{.*#+}} xmm2 = [1,0,0,0]
+; X86-SSE4-NEXT:    pand %xmm0, %xmm2
+; X86-SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; X86-SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X86-SSE4-NEXT:    retl
+;
+; X64-SSE4-LABEL: trunc_lshr_v4i64_demandedelts:
+; X64-SSE4:       # %bb.0:
+; X64-SSE4-NEXT:    movq {{.*#+}} xmm2 = [1,0]
+; X64-SSE4-NEXT:    pand %xmm0, %xmm2
+; X64-SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; X64-SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X64-SSE4-NEXT:    retq
 ;
 ; X86-AVX1-LABEL: trunc_lshr_v4i64_demandedelts:
 ; X86-AVX1:       # %bb.0:
@@ -447,8 +467,4 @@ define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; X64-AVX2: {{.*}}
-; X64-SSE2: {{.*}}
-; X64-SSE4: {{.*}}
 ; X86-AVX2: {{.*}}
-; X86-SSE2: {{.*}}
-; X86-SSE4: {{.*}}
diff --git a/llvm/test/CodeGen/X86/peephole-fold-movsd.ll b/llvm/test/CodeGen/X86/peephole-fold-movsd.ll
index c0a6e00ec695e4c..46d65e6f375e5fa 100644
--- a/llvm/test/CodeGen/X86/peephole-fold-movsd.ll
+++ b/llvm/test/CodeGen/X86/peephole-fold-movsd.ll
@@ -18,7 +18,7 @@ define dso_local void @foo1(double %a.coerce0, double %a.coerce1, double %b.coer
 ; CHECK-NEXT:    movq %rsp, %rdi
 ; CHECK-NEXT:    callq foo3 at PLT
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,u]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT:    addpd %xmm0, %xmm1
 ; CHECK-NEXT:    movapd %xmm1, g(%rip)
diff --git a/llvm/test/CodeGen/X86/pr14161.ll b/llvm/test/CodeGen/X86/pr14161.ll
index cdf3757e05b20d6..a38ad03117855b1 100644
--- a/llvm/test/CodeGen/X86/pr14161.ll
+++ b/llvm/test/CodeGen/X86/pr14161.ll
@@ -24,7 +24,8 @@ entry:
 define <2 x i16> @bad(ptr, ptr) {
 ; CHECK-LABEL: bad:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = mem[1,1,1,1]
+; CHECK-NEXT:    pinsrd $1, 4(%rdi), %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT:    pminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/pr30511.ll b/llvm/test/CodeGen/X86/pr30511.ll
index 088f3bfef8542b1..0a4428d0a74e525 100644
--- a/llvm/test/CodeGen/X86/pr30511.ll
+++ b/llvm/test/CodeGen/X86/pr30511.ll
@@ -7,8 +7,9 @@ target triple = "x86_64-pc-linux-gnu"
 define i64 @PR30511(<2 x double> %a) {
 ; CHECK-LABEL: PR30511:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cvtdq2pd %xmm0, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [6.755399441055744E+15,0.0E+0]
+; CHECK-NEXT:    addpd %xmm0, %xmm1
+; CHECK-NEXT:    cvtdq2pd %xmm1, %xmm0
 ; CHECK-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    movq %xmm0, %rax
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr31956.ll b/llvm/test/CodeGen/X86/pr31956.ll
index 38b55a5c32a617f..692cdaff33fc1e7 100644
--- a/llvm/test/CodeGen/X86/pr31956.ll
+++ b/llvm/test/CodeGen/X86/pr31956.ll
@@ -9,10 +9,11 @@ target triple = "x86_64-scei-ps4"
 define <4 x float> @foo() {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovaps G2(%rip), %xmm0
-; CHECK-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; CHECK-NEXT:    vbroadcastss G2+16(%rip), %xmm0
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],zero,zero
+; CHECK-NEXT:    vbroadcastss G2+24(%rip), %xmm1
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; CHECK-NEXT:    retq
 entry:
   %V = load <2 x float>, ptr @G1, align 8
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
index aed5ea3ed217b7e..517d93f487883e4 100644
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -52,24 +52,24 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-O3-NEXT:    movq %rsp, %rbp
 ; CHECK-O3-NEXT:    andq $-32, %rsp
 ; CHECK-O3-NEXT:    subq $32, %rsp
-; CHECK-O3-NEXT:    vmovdqa 208(%rbp), %ymm3
-; CHECK-O3-NEXT:    vmovdqa 144(%rbp), %ymm0
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-O3-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,1,2,1]
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; CHECK-O3-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
-; CHECK-O3-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; CHECK-O3-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
-; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; CHECK-O3-NEXT:    vpbroadcastq 248(%rbp), %ymm4
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
-; CHECK-O3-NEXT:    vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7]
+; CHECK-O3-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-O3-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-O3-NEXT:    vbroadcastsd 160(%rbp), %ymm3
+; CHECK-O3-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; CHECK-O3-NEXT:    vbroadcastsd 216(%rbp), %ymm4
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; CHECK-O3-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; CHECK-O3-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,1]
+; CHECK-O3-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; CHECK-O3-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; CHECK-O3-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
+; CHECK-O3-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; CHECK-O3-NEXT:    vbroadcastsd 248(%rbp), %ymm4
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
 ; CHECK-O3-NEXT:    movq %rbp, %rsp
 ; CHECK-O3-NEXT:    popq %rbp
 ; CHECK-O3-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr36553.ll b/llvm/test/CodeGen/X86/pr36553.ll
index b61ec8147308117..17649d43352d292 100644
--- a/llvm/test/CodeGen/X86/pr36553.ll
+++ b/llvm/test/CodeGen/X86/pr36553.ll
@@ -8,7 +8,8 @@ define float @pr36553(float %a, float %b, float %c) nounwind {
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    callq _fmaf
-; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/pr40811.ll b/llvm/test/CodeGen/X86/pr40811.ll
index 7851856713e82aa..63bfbcec1e1dae8 100644
--- a/llvm/test/CodeGen/X86/pr40811.ll
+++ b/llvm/test/CodeGen/X86/pr40811.ll
@@ -4,10 +4,11 @@
 define <8 x i32> @_Z6test70v(ptr %id14793) {
 ; CHECK-LABEL: _Z6test70v:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovaps (%rdi), %xmm0
-; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2,3]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3,1,0]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2,1,0]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-NEXT:    vpinsrd $1, {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,0]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/pr63091.ll b/llvm/test/CodeGen/X86/pr63091.ll
index 3f50be8ab8df9a6..9f4700e94df6813 100644
--- a/llvm/test/CodeGen/X86/pr63091.ll
+++ b/llvm/test/CodeGen/X86/pr63091.ll
@@ -35,9 +35,10 @@ define <4 x i32> @dont_merge_pcmpgt(<16 x i8> %0, <4 x i32> %1) {
 define <4 x i32> @merge_and(<16 x i8> %0, <4 x i32> %1) {
 ; SSE-LABEL: merge_and:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    pinsrd $3, {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: merge_and:
diff --git a/llvm/test/CodeGen/X86/sar_fold64.ll b/llvm/test/CodeGen/X86/sar_fold64.ll
index 245af74c238912e..234dbbd620d5149 100644
--- a/llvm/test/CodeGen/X86/sar_fold64.ll
+++ b/llvm/test/CodeGen/X86/sar_fold64.ll
@@ -99,16 +99,18 @@ define <4 x i32> @all_sign_bit_ashr_vec0(<4 x i32> %x) {
 define <4 x i32> @all_sign_bit_ashr_vec1(<4 x i32> %x) {
 ; SSE-LABEL: all_sign_bit_ashr_vec1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT:    movd {{.*#+}} xmm1 = [1,0,0,0]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE-NEXT:    pxor %xmm0, %xmm0
 ; SSE-NEXT:    psubd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: all_sign_bit_ashr_vec1:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
@@ -159,16 +161,18 @@ define <4 x i32> @all_sign_bit_ashr_vec2(<4 x i32> %x) {
 define <4 x i32> @all_sign_bit_ashr_vec3(<4 x i32> %x) {
 ; SSE-LABEL: all_sign_bit_ashr_vec3:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT:    movd {{.*#+}} xmm1 = [1,0,0,0]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE-NEXT:    paddd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: all_sign_bit_ashr_vec3:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll
index e723569bda8a126..193f2df95a04542 100644
--- a/llvm/test/CodeGen/X86/setcc-combine.ll
+++ b/llvm/test/CodeGen/X86/setcc-combine.ll
@@ -352,7 +352,7 @@ define i64 @sub_constant_to_shift_to_add(i32 %x, i64 %s1, i64 %s2) {
 define float @olt(float %x) {
 ; CHECK-LABEL: olt:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT:    xorps %xmm0, %xmm1
 ; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -365,7 +365,7 @@ define float @olt(float %x) {
 define double @ogt(double %x) {
 ; CHECK-LABEL: ogt:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
 ; CHECK-NEXT:    xorpd %xmm0, %xmm1
 ; CHECK-NEXT:    maxsd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -481,7 +481,7 @@ define double @ogt_no_fneg(double %x, double %y) {
 define double @ogt_no_zero(double %x) {
 ; CHECK-LABEL: ogt_no_zero:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
 ; CHECK-NEXT:    xorpd %xmm0, %xmm1
 ; CHECK-NEXT:    movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT:    cmpltsd %xmm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index 2ac2be5545dfdcf..a69f13839e53fae 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -119,22 +119,18 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:  .LBB0_2: # %vector.body
 ; CHECK-AVX2-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-AVX2-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %xmm5
-; CHECK-AVX2-NEXT:    vmovdqu 1040(%rdx,%rsi), %xmm6
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %rdi
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm6, %r8
-; CHECK-AVX2-NEXT:    vmovq %xmm5, %r9
-; CHECK-AVX2-NEXT:    vmovq %xmm6, %r10
-; CHECK-AVX2-NEXT:    negq %r10
-; CHECK-AVX2-NEXT:    movq %rcx, %r10
-; CHECK-AVX2-NEXT:    sbbq %r8, %r10
-; CHECK-AVX2-NEXT:    setge %r8b
-; CHECK-AVX2-NEXT:    movzbl %r8b, %r8d
+; CHECK-AVX2-NEXT:    movq 1040(%rdx,%rsi), %rdi
+; CHECK-AVX2-NEXT:    movq 1024(%rdx,%rsi), %r8
+; CHECK-AVX2-NEXT:    negq %rdi
+; CHECK-AVX2-NEXT:    movq %rcx, %rdi
+; CHECK-AVX2-NEXT:    sbbq 1048(%rdx,%rsi), %rdi
+; CHECK-AVX2-NEXT:    setge %dil
+; CHECK-AVX2-NEXT:    movzbl %dil, %edi
+; CHECK-AVX2-NEXT:    negq %rdi
+; CHECK-AVX2-NEXT:    vmovq %rdi, %xmm5
 ; CHECK-AVX2-NEXT:    negq %r8
-; CHECK-AVX2-NEXT:    vmovq %r8, %xmm5
-; CHECK-AVX2-NEXT:    negq %r9
-; CHECK-AVX2-NEXT:    movq %rcx, %r8
-; CHECK-AVX2-NEXT:    sbbq %rdi, %r8
+; CHECK-AVX2-NEXT:    movq %rcx, %rdi
+; CHECK-AVX2-NEXT:    sbbq 1032(%rdx,%rsi), %rdi
 ; CHECK-AVX2-NEXT:    setge %dil
 ; CHECK-AVX2-NEXT:    movzbl %dil, %edi
 ; CHECK-AVX2-NEXT:    negq %rdi
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index e53eed458779755..e83151f3eaa1e80 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1864,9 +1864,10 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    psllq $32, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = [65536,0,0,0]
+; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X86-SSE-NEXT:    psllq $32, %xmm1
+; X86-SSE-NEXT:    movq %xmm1, (%edx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst3:
@@ -1885,9 +1886,10 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT:    movq c(%rip), %rax
 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    psllq $32, %xmm0
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = [65536,0,0,0]
+; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT:    psllq $32, %xmm1
+; X64-SSE-NEXT:    movq %xmm1, (%rax,%rsi,4)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: mul_2xi16_varconst3:
@@ -1922,9 +1924,10 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    psrad $16, %xmm0
-; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    psllq $32, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = [32768,0,0,0]
+; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X86-SSE-NEXT:    psllq $32, %xmm1
+; X86-SSE-NEXT:    movq %xmm1, (%edx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst4:
@@ -1943,9 +1946,10 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT:    movq c(%rip), %rax
 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-SSE-NEXT:    psrad $16, %xmm0
-; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    psllq $32, %xmm0
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = [32768,0,0,0]
+; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT:    psllq $32, %xmm1
+; X64-SSE-NEXT:    movq %xmm1, (%rax,%rsi,4)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: mul_2xi16_varconst4:
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index ec442c185706cfc..3e880589566cc37 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -384,22 +384,26 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
 define <4 x double> @PR34175(ptr %p) {
 ; AVX512F-LABEL: PR34175:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT:    vpinsrw $0, 48(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrw $0, 32(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: PR34175:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vpinsrw $0, 48(%rdi), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpinsrw $0, 32(%rdi), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512VL-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512VL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll
index 76a95e250457004..459f128472ef021 100644
--- a/llvm/test/CodeGen/X86/splat-for-size.ll
+++ b/llvm/test/CodeGen/X86/splat-for-size.ll
@@ -388,13 +388,13 @@ define <8 x i64> @pr23259() #1 {
 ; AVX-LABEL: pr23259:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm1[2,3]
+; AVX-NEXT:    vpinsrq $0, A+16(%rip), %xmm1, %xmm0
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: pr23259:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovaps A+16(%rip), %xmm0
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3,4,5,6,7]
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
 ; AVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
index 2b78a70ebcc26f5..fad383c7b46b0c0 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
@@ -20,7 +20,8 @@ define float @f32_no_daz(float %f) #0 {
 ; NHM-NEXT:    mulss %xmm1, %xmm2
 ; NHM-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; NHM-NEXT:    mulss %xmm3, %xmm2
-; NHM-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NHM-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; NHM-NEXT:    andps %xmm1, %xmm0
 ; NHM-NEXT:    cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; NHM-NEXT:    andnps %xmm2, %xmm0
 ; NHM-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll
index 85f7733e671a761..31de79ea0fe64b9 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll
@@ -12,7 +12,8 @@ define float @f32_tune_nhm(float %f) #0 {
 ; CHECK-NEXT:    mulss %xmm1, %xmm2
 ; CHECK-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; CHECK-NEXT:    mulss %xmm3, %xmm2
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    andnps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -49,7 +50,8 @@ define float @f32_tune_x86_64(float %f) #3 {
 ; CHECK-NEXT:    mulss %xmm1, %xmm2
 ; CHECK-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; CHECK-NEXT:    mulss %xmm3, %xmm2
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    andnps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 384f8b832afb958..0b304136ccfea37 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -764,16 +764,18 @@ define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x
 define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
 ; SSE-LABEL: div_sqrt_fabs_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE-NEXT:    sqrtsd %xmm2, %xmm2
-; SSE-NEXT:    mulsd %xmm2, %xmm1
-; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm3 = [NaN,0.0E+0]
+; SSE-NEXT:    andpd %xmm1, %xmm3
+; SSE-NEXT:    mulsd %xmm2, %xmm3
+; SSE-NEXT:    divsd %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: div_sqrt_fabs_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vsqrtsd %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = [NaN,0.0E+0]
+; AVX-NEXT:    vandpd %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 08d9183bd30b678..d794340d14701d9 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -1014,7 +1014,8 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; CHECK-SSE2-NEXT:    pand %xmm0, %xmm2
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
@@ -1036,7 +1037,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE41-LABEL: test_srem_odd_INT_MIN:
 ; CHECK-SSE41:       # %bb.0:
 ; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-SSE41-NEXT:    pinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm2
 ; CHECK-SSE41-NEXT:    pand %xmm0, %xmm2
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
 ; CHECK-SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1051,7 +1052,8 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-AVX1-LABEL: test_srem_odd_INT_MIN:
 ; CHECK-AVX1:       # %bb.0:
 ; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT:    vpinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm1
 ; CHECK-AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1122,10 +1124,12 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; CHECK-SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; CHECK-SSE2-NEXT:    pand %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2]
 ; CHECK-SSE2-NEXT:    psrld $31, %xmm1
 ; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -1147,7 +1151,8 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [306783378,306783378,1,306783378]
 ; CHECK-SSE41-NEXT:    pminud %xmm3, %xmm2
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
-; CHECK-SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm3
+; CHECK-SSE41-NEXT:    pand %xmm3, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
@@ -1168,7 +1173,8 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
 ; CHECK-AVX1-NEXT:    vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm3
+; CHECK-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
@@ -1238,10 +1244,12 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; CHECK-SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; CHECK-SSE2-NEXT:    pand %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2]
 ; CHECK-SSE2-NEXT:    psrld $31, %xmm1
 ; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -1263,7 +1271,8 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [858993458,306783378,1,42949672]
 ; CHECK-SSE41-NEXT:    pminud %xmm3, %xmm2
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
-; CHECK-SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm3
+; CHECK-SSE41-NEXT:    pand %xmm3, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
@@ -1284,7 +1293,8 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
 ; CHECK-AVX1-NEXT:    vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm3
+; CHECK-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
@@ -2211,59 +2221,61 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-SSE2-LABEL: pr51133:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    movq %rdi, %rax
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm5
-; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,0,41,183,1,1,161,221]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; CHECK-SSE2-NEXT:    pand %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm6
-; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,103,183,171,61,1,127,183]
-; CHECK-SSE2-NEXT:    pand %xmm4, %xmm6
-; CHECK-SSE2-NEXT:    packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE2-NEXT:    movdqa %xmm6, %xmm5
-; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [128,1,128,1,128,32,1,1]
-; CHECK-SSE2-NEXT:    psrlw $8, %xmm5
-; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32]
-; CHECK-SSE2-NEXT:    psrlw $8, %xmm6
-; CHECK-SSE2-NEXT:    packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
-; CHECK-SSE2-NEXT:    pminub %xmm6, %xmm7
-; CHECK-SSE2-NEXT:    pcmpeqb %xmm6, %xmm7
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SSE2-NEXT:    pandn %xmm5, %xmm7
-; CHECK-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm6, %xmm6
-; CHECK-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
-; CHECK-SSE2-NEXT:    pandn %xmm1, %xmm5
-; CHECK-SSE2-NEXT:    por %xmm7, %xmm5
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; CHECK-SSE2-NEXT:    movzbl {{\.?LCPI[0-9]+_[0-9]+}}+5(%rip), %ecx
+; CHECK-SSE2-NEXT:    movd %ecx, %xmm6
+; CHECK-SSE2-NEXT:    psllq $40, %xmm6
+; CHECK-SSE2-NEXT:    pand %xmm1, %xmm6
 ; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,223,205,183,161,1,171,239]
-; CHECK-SSE2-NEXT:    pand %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [9,0,41,183,1,1,161,221]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; CHECK-SSE2-NEXT:    pand %xmm5, %xmm1
+; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [171,103,183,171,61,1,127,183]
+; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm4
+; CHECK-SSE2-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,1,128,1,128,32,1,1]
+; CHECK-SSE2-NEXT:    psrlw $8, %xmm1
+; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,1,1,128,64,2,1,32]
+; CHECK-SSE2-NEXT:    psrlw $8, %xmm4
+; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm4
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
+; CHECK-SSE2-NEXT:    pminub %xmm4, %xmm7
+; CHECK-SSE2-NEXT:    pcmpeqb %xmm4, %xmm7
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SSE2-NEXT:    pandn %xmm4, %xmm7
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    pcmpgtb %xmm1, %xmm6
+; CHECK-SSE2-NEXT:    pandn %xmm6, %xmm4
+; CHECK-SSE2-NEXT:    por %xmm7, %xmm4
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm6
+; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [223,223,205,183,161,1,171,239]
+; CHECK-SSE2-NEXT:    pand %xmm5, %xmm6
 ; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,205,27,241,1,1,1,163]
-; CHECK-SSE2-NEXT:    pand %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pand %xmm5, %xmm0
+; CHECK-SSE2-NEXT:    packuswb %xmm6, %xmm0
 ; CHECK-SSE2-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,128,1,1,1,128,1,64]
-; CHECK-SSE2-NEXT:    psrlw $8, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm5
+; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [128,128,1,1,1,128,1,64]
+; CHECK-SSE2-NEXT:    psrlw $8, %xmm5
 ; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1,128,128,32,128,32]
 ; CHECK-SSE2-NEXT:    psrlw $8, %xmm0
-; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
-; CHECK-SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqb %xmm6, %xmm3
-; CHECK-SSE2-NEXT:    pandn %xmm5, %xmm3
-; CHECK-SSE2-NEXT:    pcmpeqb %xmm6, %xmm2
-; CHECK-SSE2-NEXT:    pandn %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    packuswb %xmm5, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
+; CHECK-SSE2-NEXT:    pmaxub %xmm0, %xmm5
+; CHECK-SSE2-NEXT:    pcmpeqb %xmm0, %xmm5
+; CHECK-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
+; CHECK-SSE2-NEXT:    pandn %xmm4, %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pandn %xmm5, %xmm2
 ; CHECK-SSE2-NEXT:    pmovmskb %xmm2, %ecx
 ; CHECK-SSE2-NEXT:    pmovmskb %xmm3, %edx
 ; CHECK-SSE2-NEXT:    shll $16, %edx
@@ -2474,7 +2486,9 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; CHECK-AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpinsrb $5, {{\.?LCPI[0-9]+_[0-9]+}}+21(%rip), %xmm0, %xmm4
+; CHECK-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; CHECK-AVX512VL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; CHECK-AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
 ; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm3
diff --git a/llvm/test/CodeGen/X86/sse-align-12.ll b/llvm/test/CodeGen/X86/sse-align-12.ll
index 7b4bd3ffdf00c51..b8873e3839cd2b7 100644
--- a/llvm/test/CodeGen/X86/sse-align-12.ll
+++ b/llvm/test/CodeGen/X86/sse-align-12.ll
@@ -54,8 +54,8 @@ define <2 x double> @c(ptr %y) nounwind {
 define <2 x double> @d(ptr %y, <2 x double> %z) nounwind {
 ; CHECK-LABEL: d:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movups (%rdi), %xmm1
-; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
 ; CHECK-NEXT:    retq
   %x = load <2 x double>, ptr %y, align 8
   %a = extractelement <2 x double> %x, i32 1
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index cf5f527b16114fa..67c0f393c086056 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -392,19 +392,19 @@ define <2 x double> @test11(double %a, double %b) nounwind {
 define void @test12() nounwind {
 ; SSE-LABEL: test12:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd 0, %xmm0
-; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
-; SSE-NEXT:    addps %xmm1, %xmm2
-; SSE-NEXT:    movaps %xmm2, 0
+; SSE-NEXT:    movaps 0, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE-NEXT:    addps %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, 0
 ; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX1-LABEL: test12:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovaps 0, %xmm0
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -529,29 +529,45 @@ define <4 x float> @test15(ptr %x, ptr %y) nounwind {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0
-; X86-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-SSE-NEXT:    retl
 ;
-; X86-AVX-LABEL: test15:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
-; X86-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; X86-AVX-NEXT:    retl
+; X86-AVX1-LABEL: test15:
+; X86-AVX1:       # %bb.0: # %entry
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; X86-AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX512-LABEL: test15:
+; X86-AVX512:       # %bb.0: # %entry
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X86-AVX512-NEXT:    vunpcklpd 8(%eax){1to2}, %xmm0, %xmm0
+; X86-AVX512-NEXT:    retl
 ;
 ; X64-SSE-LABEL: test15:
 ; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movaps (%rdi), %xmm0
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X64-SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: test15:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; X64-AVX-NEXT:    retq
+; X64-AVX1-LABEL: test15:
+; X64-AVX1:       # %bb.0: # %entry
+; X64-AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; X64-AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX512-LABEL: test15:
+; X64-AVX512:       # %bb.0: # %entry
+; X64-AVX512-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX512-NEXT:    vunpcklpd 8(%rsi){1to2}, %xmm0, %xmm0
+; X64-AVX512-NEXT:    retq
 entry:
   %tmp = load <4 x float>, ptr %y             ; <<4 x float>> [#uses=1]
   %tmp3 = load <4 x float>, ptr %x            ; <<4 x float>> [#uses=1]
@@ -565,27 +581,27 @@ define  <2 x double> @test16(ptr nocapture %srcA, ptr nocapture %dst) {
 ; X86-SSE-LABEL: test16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movaps 96(%eax), %xmm0
-; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test16:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vmovaps 96(%eax), %xmm0
-; X86-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: test16:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movaps 96(%rdi), %xmm0
-; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test16:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovaps 96(%rdi), %xmm0
-; X64-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X64-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X64-AVX-NEXT:    retq
   %i5 = getelementptr inbounds <4 x double>, ptr %srcA, i32 3
   %i6 = load <4 x double>, ptr %i5, align 32
@@ -700,8 +716,3 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
   %m = mul <4 x i32> %x, %y
   ret <4 x i32> %m
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-AVX1: {{.*}}
-; X64-AVX512: {{.*}}
-; X86-AVX1: {{.*}}
-; X86-AVX512: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
index 1a4df9a175ffad3..2d3008d980a6dad 100644
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -39,20 +39,22 @@ define <8 x i16> @t1(ptr %A, ptr %B) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-NEXT:    movaps %xmm0, %xmm1
-; X86-NEXT:    andnps (%ecx), %xmm1
-; X86-NEXT:    andps (%eax), %xmm0
-; X86-NEXT:    orps %xmm1, %xmm0
+; X86-NEXT:    movaps (%eax), %xmm2
+; X86-NEXT:    andps %xmm0, %xmm2
+; X86-NEXT:    andnps %xmm1, %xmm0
+; X86-NEXT:    orps %xmm2, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t1:
 ; X64:       # %bb.0:
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X64-NEXT:    movaps %xmm0, %xmm1
-; X64-NEXT:    andnps (%rsi), %xmm1
-; X64-NEXT:    andps (%rdi), %xmm0
-; X64-NEXT:    orps %xmm1, %xmm0
+; X64-NEXT:    movaps (%rdi), %xmm2
+; X64-NEXT:    andps %xmm0, %xmm2
+; X64-NEXT:    andnps %xmm1, %xmm0
+; X64-NEXT:    orps %xmm2, %xmm0
 ; X64-NEXT:    retq
 	%tmp1 = load <8 x i16>, ptr %A
 	%tmp2 = load <8 x i16>, ptr %B
@@ -395,14 +397,14 @@ entry:
 define <4 x i32> @t17() nounwind {
 ; X86-LABEL: t17:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t17:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X64-NEXT:    retq
 entry:
   %tmp1 = load <4 x float>, ptr undef, align 16
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 2d7258a49f5d09b..07d7f82d8090679 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -560,46 +560,40 @@ define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, ptr nocapture
 ; X86-SSE-LABEL: insertps_from_shufflevector_1:
 ; X86-SSE:       ## %bb.0: ## %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
-; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT:    insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_shufflevector_1:
 ; X86-AVX1:       ## %bb.0: ## %entry
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX1-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_shufflevector_1:
 ; X86-AVX512:       ## %bb.0: ## %entry
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX512-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_from_shufflevector_1:
 ; X64-SSE:       ## %bb.0: ## %entry
-; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
-; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT:    insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_shufflevector_1:
 ; X64-AVX1:       ## %bb.0: ## %entry
-; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX1-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_shufflevector_1:
 ; X64-AVX512:       ## %bb.0: ## %entry
-; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX512-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
 entry:
   %0 = load <4 x float>, ptr %pb, align 16
@@ -636,8 +630,10 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, ptr nocapture read
 ; X86-SSE-LABEL: pinsrd_from_shufflevector_i32:
 ; X86-SSE:       ## %bb.0: ## %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    pshufd $0, (%eax), %xmm1 ## encoding: [0x66,0x0f,0x70,0x08,0x00]
-; X86-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X86-SSE-NEXT:    movd (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    ## encoding: [0x66,0x0f,0x6e,0x08]
+; X86-SSE-NEXT:    pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00]
+; X86-SSE-NEXT:    ## xmm1 = xmm1[0,0,0,0]
 ; X86-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
 ; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
@@ -660,8 +656,10 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, ptr nocapture read
 ;
 ; X64-SSE-LABEL: pinsrd_from_shufflevector_i32:
 ; X64-SSE:       ## %bb.0: ## %entry
-; X64-SSE-NEXT:    pshufd $0, (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x70,0x0f,0x00]
-; X64-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X64-SSE-NEXT:    movd (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    ## encoding: [0x66,0x0f,0x6e,0x0f]
+; X64-SSE-NEXT:    pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00]
+; X64-SSE-NEXT:    ## xmm1 = xmm1[0,0,0,0]
 ; X64-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
 ; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
@@ -1372,46 +1370,40 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, ptr nocapture read
 ; X86-SSE-LABEL: insertps_from_vector_load:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
-; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT:    insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_vector_load:
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX1-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_vector_load:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX512-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_from_vector_load:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
-; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT:    insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_vector_load:
 ; X64-AVX1:       ## %bb.0:
-; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX1-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_vector_load:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX512-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = load <4 x float>, ptr %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -1424,46 +1416,40 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, ptr nocaptu
 ; X86-SSE-LABEL: insertps_from_vector_load_offset:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
-; X86-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X86-SSE-NEXT:    insertps $32, 4(%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x40,0x04,0x20]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_vector_load_offset:
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X86-AVX1-NEXT:    vinsertps $32, 4(%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x40,0x04,0x20]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_vector_load_offset:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X86-AVX512-NEXT:    vinsertps $32, 4(%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x40,0x04,0x20]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_from_vector_load_offset:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
-; X64-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X64-SSE-NEXT:    insertps $32, 4(%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x47,0x04,0x20]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_vector_load_offset:
 ; X64-AVX1:       ## %bb.0:
-; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X64-AVX1-NEXT:    vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x47,0x04,0x20]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_vector_load_offset:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X64-AVX512-NEXT:    vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x47,0x04,0x20]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = load <4 x float>, ptr %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -1477,9 +1463,10 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-SSE-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
-; X86-SSE-NEXT:    movaps (%eax,%ecx), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x08]
-; X86-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
-; X86-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X86-SSE-NEXT:    movss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x4c,0x08,0x0c]
+; X86-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_vector_load_offset_2:
@@ -1487,9 +1474,10 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-AVX1-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08]
-; X86-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
-; X86-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X86-AVX1-NEXT:    vmovss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x4c,0x08,0x0c]
+; X86-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_vector_load_offset_2:
@@ -1497,33 +1485,37 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-AVX512-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08]
-; X86-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
-; X86-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X86-AVX512-NEXT:    vmovss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x08,0x0c]
+; X86-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_from_vector_load_offset_2:
 ; X64-SSE:       ## %bb.0:
 ; X64-SSE-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
-; X64-SSE-NEXT:    movaps (%rdi,%rsi), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x37]
-; X64-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
-; X64-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X64-SSE-NEXT:    movss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x4c,0x37,0x0c]
+; X64-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_vector_load_offset_2:
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
-; X64-AVX1-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x37]
-; X64-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
-; X64-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X64-AVX1-NEXT:    vmovss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x4c,0x37,0x0c]
+; X64-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_vector_load_offset_2:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
-; X64-AVX512-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x37]
-; X64-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
-; X64-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X64-AVX512-NEXT:    vmovss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x37,0x0c]
+; X64-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index
   %2 = load <4 x float>, ptr %1, align 16
@@ -1587,9 +1579,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapt
 ; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08]
-; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT:    insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
@@ -1608,9 +1599,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapt
 ;
 ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f]
-; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT:    insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
@@ -1819,46 +1809,40 @@ define <4 x float> @pr20087(<4 x float> %a, ptr%ptr) {
 ; X86-SSE-LABEL: pr20087:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
-; X86-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X86-SSE-NEXT:    insertps $50, 8(%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x40,0x08,0x32]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: pr20087:
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X86-AVX1-NEXT:    vinsertps $50, 8(%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x40,0x08,0x32]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: pr20087:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X86-AVX512-NEXT:    vinsertps $50, 8(%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x40,0x08,0x32]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: pr20087:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
-; X64-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X64-SSE-NEXT:    insertps $50, 8(%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x47,0x08,0x32]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: pr20087:
 ; X64-AVX1:       ## %bb.0:
-; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X64-AVX1-NEXT:    vinsertps $50, 8(%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x47,0x08,0x32]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: pr20087:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X64-AVX512-NEXT:    vinsertps $50, 8(%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x47,0x08,0x32]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %load = load <4 x float> , ptr%ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
diff --git a/llvm/test/CodeGen/X86/strict-fsub-combines.ll b/llvm/test/CodeGen/X86/strict-fsub-combines.ll
index 774ea02ccd87a44..be491bc330129ef 100644
--- a/llvm/test/CodeGen/X86/strict-fsub-combines.ll
+++ b/llvm/test/CodeGen/X86/strict-fsub-combines.ll
@@ -8,9 +8,10 @@ define float @fneg_strict_fsub_to_strict_fadd(float %x, float %y) nounwind stric
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT:    subss %xmm1, %xmm0
+; X86-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    xorps %xmm1, %xmm2
+; X86-NEXT:    subss %xmm2, %xmm0
 ; X86-NEXT:    movss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    wait
@@ -19,8 +20,9 @@ define float @fneg_strict_fsub_to_strict_fadd(float %x, float %y) nounwind stric
 ;
 ; X64-LABEL: fneg_strict_fsub_to_strict_fadd:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT:    subss %xmm1, %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm2 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    xorps %xmm1, %xmm2
+; X64-NEXT:    subss %xmm2, %xmm0
 ; X64-NEXT:    retq
   %neg = fneg float %y
   %sub = call float @llvm.experimental.constrained.fsub.f32(float %x, float %neg, metadata!"round.dynamic", metadata!"fpexcept.strict")
@@ -48,8 +50,9 @@ define double @fneg_strict_fsub_to_strict_fadd_d(double %x, double %y) nounwind
 ;
 ; X64-LABEL: fneg_strict_fsub_to_strict_fadd_d:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT:    subsd %xmm1, %xmm0
+; X64-NEXT:    movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    xorpd %xmm1, %xmm2
+; X64-NEXT:    subsd %xmm2, %xmm0
 ; X64-NEXT:    retq
   %neg = fneg double %y
   %sub = call double @llvm.experimental.constrained.fsub.f64(double %x, double %neg, metadata!"round.dynamic", metadata!"fpexcept.strict")
@@ -63,8 +66,9 @@ define float @strict_fsub_fneg_to_strict_fsub(float %x, float %y) nounwind stric
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-NEXT:    xorps %xmm0, %xmm1
+; X86-NEXT:    movss %xmm1, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
@@ -73,7 +77,8 @@ define float @strict_fsub_fneg_to_strict_fsub(float %x, float %y) nounwind stric
 ; X64-LABEL: strict_fsub_fneg_to_strict_fsub:
 ; X64:       # %bb.0:
 ; X64-NEXT:    subss %xmm1, %xmm0
-; X64-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    xorps %xmm1, %xmm0
 ; X64-NEXT:    retq
   %sub = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata!"round.dynamic", metadata!"fpexcept.strict")
   %neg = fneg float %sub
@@ -101,7 +106,8 @@ define double @strict_fsub_fneg_to_strict_fsub_d(double %x, double %y) nounwind
 ; X64-LABEL: strict_fsub_fneg_to_strict_fsub_d:
 ; X64:       # %bb.0:
 ; X64-NEXT:    subsd %xmm1, %xmm0
-; X64-NEXT:    xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    xorpd %xmm1, %xmm0
 ; X64-NEXT:    retq
   %sub = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata!"round.dynamic", metadata!"fpexcept.strict")
   %neg = fneg double %sub
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index 76183ac5f8fa3e4..8d227493f3bbb87 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -1647,13 +1647,13 @@ define <4 x double> @broadcast_v4f64_v2f64_4u61(ptr %vp, <4 x double> %default)
 ; X86-LABEL: broadcast_v4f64_v2f64_4u61:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vinsertf128 $1, (%eax), %ymm0, %ymm1
+; X86-NEXT:    vbroadcastsd 8(%eax), %ymm1
 ; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: broadcast_v4f64_v2f64_4u61:
 ; X64:       # %bb.0:
-; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm1
+; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm1
 ; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; X64-NEXT:    retq
   %vec = load <2 x double>, ptr %vp
diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll
index 953a0d65c5386c1..029c76a9f3ad372 100644
--- a/llvm/test/CodeGen/X86/test-shrink-bug.ll
+++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll
@@ -66,8 +66,11 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) {
 ; CHECK-X64-LABEL: fail:
 ; CHECK-X64:       # %bb.0:
 ; CHECK-X64-NEXT:    pslld $8, %xmm0
-; CHECK-X64-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-X64-NEXT:    pextrw $1, %xmm0, %eax
+; CHECK-X64-NEXT:    movzbl {{\.?LCPI[0-9]+_[0-9]+}}+2(%rip), %eax
+; CHECK-X64-NEXT:    movd %eax, %xmm1
+; CHECK-X64-NEXT:    pslld $16, %xmm1
+; CHECK-X64-NEXT:    pcmpeqb %xmm0, %xmm1
+; CHECK-X64-NEXT:    pextrw $1, %xmm1, %eax
 ; CHECK-X64-NEXT:    xorb $1, %al
 ; CHECK-X64-NEXT:    testl $263, %edi # imm = 0x107
 ; CHECK-X64-NEXT:    setne %cl
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
index 5ea991f85523ea0..e78b5e19c5dc2ff 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
@@ -663,70 +663,20 @@ define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind {
 }
 
 define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-SKX-LABEL: transform_VUNPCKLPDrm:
-; CHECK-SKX:       # %bb.0:
-; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-SKX-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VUNPCKLPDrm:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VUNPCKLPDrm:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKLPDrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; CHECK-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shufp
 }
 
 define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-SKX-LABEL: transform_VUNPCKHPDrm:
-; CHECK-SKX:       # %bb.0:
-; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-SKX-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VUNPCKHPDrm:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VUNPCKHPDrm:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKHPDrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x float> %shufp
@@ -848,37 +798,43 @@ define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_
 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-SKX:       # %bb.0:
 ; CHECK-SKX-NEXT:    kmovd %esi, %k1
-; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-SKX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-SKX-NEXT:    retq
 ;
 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-V4:       # %bb.0:
 ; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-V4-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-V4-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-AVX512:       # %bb.0:
 ; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-AVX512-NEXT:    retq
 ;
 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
@@ -888,41 +844,11 @@ define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_
 }
 
 define <2 x double> @transform_VUNPCKHPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind {
-; CHECK-SKX-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-SKX:       # %bb.0:
-; CHECK-SKX-NEXT:    kmovd %esi, %k1
-; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-SKX-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
   %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
@@ -1060,42 +986,48 @@ define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x doubl
 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-SKX:       # %bb.0:
 ; CHECK-SKX-NEXT:    kmovd %esi, %k1
-; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-SKX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-SKX-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-SKX-NEXT:    retq
 ;
 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-V4:       # %bb.0:
 ; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-V4-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-V4-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-V4-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-AVX512:       # %bb.0:
 ; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-AVX512-NEXT:    retq
 ;
 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
 ; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-ZNVER4-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
@@ -1106,47 +1038,12 @@ define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x doubl
 }
 
 define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind {
-; CHECK-SKX-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-SKX:       # %bb.0:
-; CHECK-SKX-NEXT:    kmovd %esi, %k1
-; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-SKX-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-SKX-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-V4-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-AVX512-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-ZNVER4-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKHPDrmk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
   %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
index 6940c33c9d327d6..ff02527b05d2dd4 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
@@ -163,30 +163,10 @@ define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind {
 }
 
 define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-AVX2-LABEL: transform_VUNPCKLPDrm:
-; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-AVX2-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-SNB-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-SNB-BYPASS-DELAY:       # %bb.0:
-; CHECK-SNB-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-SNB-BYPASS-DELAY-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKLPDrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; CHECK-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shufp
@@ -195,34 +175,30 @@ define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
 define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
 ; CHECK-AVX2-LABEL: transform_VUNPCKHPDrm:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-AVX2-NEXT:    retq
 ;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+; CHECK-ICX-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX:       # %bb.0:
+; CHECK-ICX-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0
+; CHECK-ICX-NEXT:    retq
 ;
 ; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
 ; CHECK-SNB-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
 ; CHECK-SNB-BYPASS-DELAY:       # %bb.0:
-; CHECK-SNB-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SNB-BYPASS-DELAY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-SNB-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-SNB-BYPASS-DELAY-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x float> %shufp
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
-; CHECK-ICX: {{.*}}
 ; CHECK-SKL: {{.*}}
 ; CHECK-V3: {{.*}}
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
index 36094fe56d57740..b36592bf90ae792 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
@@ -243,7 +243,8 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
 ; CHECK-AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vmovq {{.*#+}} xmm1 = [9223372036854775808,0]
+; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    movabsq $-3074457345618258603, %rax # imm = 0xD555555555555555
 ; CHECK-AVX1-NEXT:    vmovq %rax, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -262,7 +263,8 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
 ; CHECK-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vmovq {{.*#+}} xmm1 = [9223372036854775808,0]
+; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    movabsq $-3074457345618258603, %rax # imm = 0xD555555555555555
 ; CHECK-AVX2-NEXT:    vmovq %rax, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index 91743898545ee1f..ace118ee17fada5 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -34,18 +34,18 @@ define <4 x float> @t2(ptr %P) nounwind {
 ; X86-LABEL: t2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    xorps %xmm1, %xmm1
-; X86-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
+; X86-NEXT:    pxor %xmm0, %xmm0
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t2:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
+; X64-NEXT:    pxor %xmm0, %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
 ; X64-NEXT:    retq
   %tmp1 = load <4 x float>, ptr %P
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
@@ -56,14 +56,12 @@ define <4 x float> @t3(ptr %P) nounwind {
 ; X86-LABEL: t3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t3:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    retq
   %tmp1 = load <4 x float>, ptr %P
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
@@ -74,18 +72,12 @@ define <4 x float> @t4(ptr %P) nounwind {
 ; X86-LABEL: t4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorps %xmm1, %xmm1
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t4:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
-; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    retq
   %tmp1 = load <4 x float>, ptr %P
   %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
@@ -96,27 +88,13 @@ define <4 x float> @t4_under_aligned(ptr %P) nounwind {
 ; X86-LABEL: t4_under_aligned:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movups (%eax), %xmm0
-; X86-NEXT:    xorps %xmm1, %xmm1
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    retl
 ;
-; ALIGN-LABEL: t4_under_aligned:
-; ALIGN:       # %bb.0:
-; ALIGN-NEXT:    movups (%rdi), %xmm0
-; ALIGN-NEXT:    xorps %xmm1, %xmm1
-; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; ALIGN-NEXT:    retq
-;
-; UNALIGN-LABEL: t4_under_aligned:
-; UNALIGN:       # %bb.0:
-; UNALIGN-NEXT:    xorps %xmm1, %xmm1
-; UNALIGN-NEXT:    xorps %xmm0, %xmm0
-; UNALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
-; UNALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
-; UNALIGN-NEXT:    retq
+; X64-LABEL: t4_under_aligned:
+; X64:       # %bb.0:
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    retq
   %tmp1 = load <4 x float>, ptr %P, align 4
   %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
   ret <4 x float> %tmp2
@@ -191,3 +169,6 @@ define <16 x i8> @t9(<16 x i8> %x) nounwind {
   %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 undef, i32 undef>
   ret <16 x i8> %s
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALIGN: {{.*}}
+; UNALIGN: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index af841cf38b24aec..c524e8956f79085 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4119,26 +4119,14 @@ define <8 x float> @sitofp_load_8i8_to_8f32(ptr%a) {
 define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ; SSE2-LABEL: uitofp_load_4i64_to_4f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq 24(%rdi), %rax
-; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq 24(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
 ; SSE2-NEXT:    js .LBB83_1
 ; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB83_3
-; SSE2-NEXT:  .LBB83_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    addss %xmm0, %xmm0
-; SSE2-NEXT:  .LBB83_3:
-; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm0
 ; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB83_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    jmp .LBB83_6
+; SSE2-NEXT:    jns .LBB83_5
 ; SSE2-NEXT:  .LBB83_4:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
@@ -4146,6 +4134,18 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ; SSE2-NEXT:    orq %rcx, %rax
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:    jmp .LBB83_6
+; SSE2-NEXT:  .LBB83_1:
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB83_4
+; SSE2-NEXT:  .LBB83_5:
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:  .LBB83_6:
 ; SSE2-NEXT:    movq (%rdi), %rax
 ; SSE2-NEXT:    movq 8(%rdi), %rcx
@@ -4448,26 +4448,14 @@ define <4 x float> @uitofp_load_4i8_to_4f32(ptr%a) {
 define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-LABEL: uitofp_load_8i64_to_8f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq 24(%rdi), %rax
-; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq 24(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
 ; SSE2-NEXT:    js .LBB87_1
 ; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    jmp .LBB87_3
-; SSE2-NEXT:  .LBB87_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    addss %xmm2, %xmm2
-; SSE2-NEXT:  .LBB87_3:
-; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
 ; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB87_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    jmp .LBB87_6
+; SSE2-NEXT:    jns .LBB87_5
 ; SSE2-NEXT:  .LBB87_4:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
@@ -4475,6 +4463,18 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-NEXT:    orq %rcx, %rax
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:    jmp .LBB87_6
+; SSE2-NEXT:  .LBB87_1:
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
+; SSE2-NEXT:    addss %xmm2, %xmm2
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB87_4
+; SSE2-NEXT:  .LBB87_5:
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:  .LBB87_6:
 ; SSE2-NEXT:    movq (%rdi), %rax
 ; SSE2-NEXT:    movq 8(%rdi), %rcx
@@ -4504,26 +4504,14 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-NEXT:  .LBB87_11:
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:  .LBB87_12:
-; SSE2-NEXT:    movq 56(%rdi), %rax
-; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    movq 48(%rdi), %rax
+; SSE2-NEXT:    movq 56(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
 ; SSE2-NEXT:    js .LBB87_13
 ; SSE2-NEXT:  # %bb.14:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm5
-; SSE2-NEXT:    jmp .LBB87_15
-; SSE2-NEXT:  .LBB87_13:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm5
-; SSE2-NEXT:    addss %xmm5, %xmm5
-; SSE2-NEXT:  .LBB87_15:
-; SSE2-NEXT:    movq 48(%rdi), %rax
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm5
 ; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB87_16
-; SSE2-NEXT:  # %bb.17:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE2-NEXT:    jmp .LBB87_18
+; SSE2-NEXT:    jns .LBB87_17
 ; SSE2-NEXT:  .LBB87_16:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
@@ -4531,28 +4519,40 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-NEXT:    orq %rcx, %rax
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
 ; SSE2-NEXT:    addss %xmm4, %xmm4
+; SSE2-NEXT:    jmp .LBB87_18
+; SSE2-NEXT:  .LBB87_13:
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm5
+; SSE2-NEXT:    addss %xmm5, %xmm5
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB87_16
+; SSE2-NEXT:  .LBB87_17:
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
 ; SSE2-NEXT:  .LBB87_18:
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    movq 40(%rdi), %rax
-; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    movq 32(%rdi), %rax
+; SSE2-NEXT:    movq 40(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
 ; SSE2-NEXT:    js .LBB87_19
 ; SSE2-NEXT:  # %bb.20:
 ; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
 ; SSE2-NEXT:    jmp .LBB87_21
 ; SSE2-NEXT:  .LBB87_19:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    orq %rdx, %rcx
 ; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
 ; SSE2-NEXT:    addss %xmm2, %xmm2
 ; SSE2-NEXT:  .LBB87_21:
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE2-NEXT:    movq 32(%rdi), %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_22
 ; SSE2-NEXT:  # %bb.23:
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index 2ab00ea96ada1fe..5f8da88eb354d5a 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -215,7 +215,7 @@ define <4 x i32> @test18(<4 x i32> %a0, ptr %dummy) {
 define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){
 ; CHECK-LABEL: extelt0_sub_pslli_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
+; CHECK-NEXT:    movd {{.*#+}} xmm2 = [32,0,0,0]
 ; CHECK-NEXT:    psubd %xmm1, %xmm2
 ; CHECK-NEXT:    pxor %xmm1, %xmm1
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
@@ -228,23 +228,15 @@ define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){
 }
 
 define <4 x i32> @extelt1_add_psrli_v4i32(<4 x i32> %x, <4 x i32> %y){
-; X86-LABEL: extelt1_add_psrli_v4i32:
-; X86:       # %bb.0:
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT:    xorps %xmm2, %xmm2
-; X86-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X86-NEXT:    psrld %xmm2, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: extelt1_add_psrli_v4i32:
-; X64:       # %bb.0:
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X64-NEXT:    psrld %xmm2, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: extelt1_add_psrli_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; CHECK-NEXT:    movd {{.*#+}} xmm2 = [3,0,0,0]
+; CHECK-NEXT:    paddd %xmm1, %xmm2
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; CHECK-NEXT:    psrld %xmm1, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %ext = extractelement <4 x i32> %y, i64 1
   %bo = add i32 %ext, 3
   %r = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 %bo)
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 5dcf19013f0b7c4..45c3b73a9948c87 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -54,8 +54,9 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ;
 ; XOP-LABEL: test_bitreverse_i8:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovd %edi, %xmm0
-; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vmovd %edi, %xmm1
+; XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    vmovd %xmm0, %eax
 ; XOP-NEXT:    # kill: def $al killed $al killed $eax
 ; XOP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
index 9c80720ae921a8e..818405349b71614 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
@@ -1,30 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -O3 -mtriple=x86_64-pc-linux -stop-after=finalize-isel < %s | FileCheck %s
 
 define <1 x float> @constrained_vector_fadd_v1f32() #0 {
-; CHECK-LABEL: name: constrained_vector_fadd_v1f32
-; CHECK: [[MOVSSrm_alt:%[0-9]+]]:fr32 = MOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
-; CHECK: [[ADDSSrm:%[0-9]+]]:fr32 = ADDSSrm [[MOVSSrm_alt]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load (s32) from constant-pool)
-; CHECK: $xmm0 = COPY [[ADDSSrm]]
-; CHECK: RET 0, $xmm0
 entry:
   %add = call <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float> <float 0x7FF0000000000000>, <1 x float> <float 1.0>, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <1 x float> %add
 }
 
 define <3 x float> @constrained_vector_fadd_v3f32() #0 {
-; CHECK-LABEL: name: constrained_vector_fadd_v3f32
-; CHECK: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS
-; CHECK: [[MOVSSrm_alt:%[0-9]+]]:fr32 = MOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
-; CHECK: [[ADDSSrr:%[0-9]+]]:fr32 = ADDSSrr [[MOVSSrm_alt]], killed [[FsFLD0SS]], implicit $mxcsr
-; CHECK: [[ADDSSrm:%[0-9]+]]:fr32 = ADDSSrm [[MOVSSrm_alt]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load (s32) from constant-pool)
-; CHECK: [[ADDSSrm1:%[0-9]+]]:fr32 = ADDSSrm [[MOVSSrm_alt]], $rip, 1, $noreg, %const.2, $noreg, implicit $mxcsr :: (load (s32) from constant-pool)
-; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY [[ADDSSrm1]]
-; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY [[ADDSSrm]]
-; CHECK: [[UNPCKLPSrr:%[0-9]+]]:vr128 = UNPCKLPSrr [[COPY1]], killed [[COPY]]
-; CHECK: [[COPY2:%[0-9]+]]:vr128 = COPY [[ADDSSrr]]
-; CHECK: [[UNPCKLPDrr:%[0-9]+]]:vr128 = UNPCKLPDrr [[UNPCKLPSrr]], killed [[COPY2]]
-; CHECK: $xmm0 = COPY [[UNPCKLPDrr]]
-; CHECK: RET 0, $xmm0
 entry:
   %add = call <3 x float> @llvm.experimental.constrained.fadd.v3f32(
            <3 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
@@ -36,13 +19,6 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fadd_v4f64() #0 {
-; CHECK-LABEL: name: constrained_vector_fadd_v4f64
-; CHECK: [[MOVAPDrm:%[0-9]+]]:vr128 = MOVAPDrm $rip, 1, $noreg, %const.0, $noreg :: (load (s128) from constant-pool)
-; CHECK: [[ADDPDrm:%[0-9]+]]:vr128 = ADDPDrm [[MOVAPDrm]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load (s128) from constant-pool)
-; CHECK: [[ADDPDrm1:%[0-9]+]]:vr128 = ADDPDrm [[MOVAPDrm]], $rip, 1, $noreg, %const.2, $noreg, implicit $mxcsr :: (load (s128) from constant-pool)
-; CHECK: $xmm0 = COPY [[ADDPDrm1]]
-; CHECK: $xmm1 = COPY [[ADDPDrm]]
-; CHECK: RET 0, $xmm0, $xmm1
 entry:
   %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64(
            <4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -59,3 +35,5 @@ attributes #0 = { strictfp }
 declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 49062eaef31887e..f25267f3dbd53cf 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -6319,31 +6319,28 @@ define <3 x double> @constrained_vector_round_v3f64_var(ptr %a) #0 {
 ;
 ; AVX-LABEL: constrained_vector_round_v3f64_var:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    pushq %rbx
-; AVX-NEXT:    .cfi_def_cfa_offset 16
-; AVX-NEXT:    subq $48, %rsp
-; AVX-NEXT:    .cfi_def_cfa_offset 64
-; AVX-NEXT:    .cfi_offset %rbx, -16
-; AVX-NEXT:    movq %rdi, %rbx
+; AVX-NEXT:    subq $72, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 80
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    callq round@PLT
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; AVX-NEXT:    # xmm0 = mem[0],zero
 ; AVX-NEXT:    callq round@PLT
-; AVX-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    callq round@PLT
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    addq $48, %rsp
-; AVX-NEXT:    .cfi_def_cfa_offset 16
-; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    addq $72, %rsp
 ; AVX-NEXT:    .cfi_def_cfa_offset 8
 ; AVX-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 193e570c5f9a877..a41f0366882152f 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -999,17 +999,19 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX1-LABEL: splatvar_funnnel_v16i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
-; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm5
-; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -1087,17 +1089,19 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
-; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; XOPAVX1-NEXT:    vpsrlw $1, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpor %xmm5, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrlw %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 1d807fa85ddc5c2..d1a77c0b543ab09 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -544,6 +544,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
@@ -562,6 +564,8 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
 ; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
@@ -622,6 +626,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; AVX512F-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX512F-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsllw %xmm2, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
@@ -641,24 +647,25 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 9c259ed38321d0b..58fb5ebc1d61289 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1034,6 +1034,8 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
@@ -1123,6 +1125,8 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOPAVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 665223167fbb4d3..9c4c1ff78939604 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -546,6 +546,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
@@ -564,6 +566,8 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
 ; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
@@ -626,6 +630,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; AVX512F-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX512F-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -646,25 +652,26 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index eafee9e65345f33..07d15b834452a86 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -216,25 +216,15 @@ define double @test_v2f64(<2 x double> %a0) {
 }
 
 define double @test_v3f64(<3 x double> %a0) {
-; SSE2-LABEL: test_v3f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
-; SSE2-NEXT:    maxpd %xmm2, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    maxsd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v3f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
-; SSE41-NEXT:    maxpd %xmm2, %xmm0
-; SSE41-NEXT:    movapd %xmm0, %xmm1
-; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    maxsd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v3f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE-NEXT:    maxpd %xmm2, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    maxsd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v3f64:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
index 5ae9e552d0dcda4..c21959e2fbabed1 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -635,7 +635,7 @@ define double @test_v3f64(<3 x double> %a0) {
 ; SSE2-LABEL: test_v3f64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
+; SSE2-NEXT:    movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
 ; SSE2-NEXT:    movapd %xmm2, %xmm1
 ; SSE2-NEXT:    minpd %xmm0, %xmm1
 ; SSE2-NEXT:    cmpunordpd %xmm0, %xmm0
@@ -656,7 +656,7 @@ define double @test_v3f64(<3 x double> %a0) {
 ; SSE41-LABEL: test_v3f64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
+; SSE41-NEXT:    movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
 ; SSE41-NEXT:    movapd %xmm2, %xmm1
 ; SSE41-NEXT:    minpd %xmm0, %xmm1
 ; SSE41-NEXT:    cmpunordpd %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index b114cba14cb6c76..fb5df6933ccf4f0 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -660,29 +660,19 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 
 define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
-; SSE2-LABEL: splatvar_rotate_v2i64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
-; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psllq %xmm1, %xmm3
-; SSE2-NEXT:    psrlq %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: splatvar_rotate_v2i64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm2 = [64,64]
-; SSE41-NEXT:    psubq %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psllq %xmm1, %xmm3
-; SSE41-NEXT:    psrlq %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: splatvar_rotate_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm2 = [64,0]
+; SSE-NEXT:    psubq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psllq %xmm1, %xmm3
+; SSE-NEXT:    psrlq %xmm2, %xmm0
+; SSE-NEXT:    por %xmm3, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splatvar_rotate_v2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [64,64]
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [64,0]
 ; AVX-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
 ; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
 ; AVX-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 86c4d79a28c891b..49ad50a99451535 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -517,7 +517,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-LABEL: splatvar_rotate_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [64,0]
 ; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpsllq %xmm1, %xmm3, %xmm4
@@ -532,7 +532,7 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX2-LABEL: splatvar_rotate_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [64,64]
+; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = [64,0]
 ; AVX2-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 54056461bff8ce0..4373620d130ebf6 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -936,17 +936,19 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-LABEL: splatvar_modulo_shift_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE-NEXT:    psrlq %xmm1, %xmm2
-; SSE-NEXT:    psrlq %xmm1, %xmm0
-; SSE-NEXT:    pxor %xmm2, %xmm0
-; SSE-NEXT:    psubq %xmm2, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm2 = [63,0]
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; SSE-NEXT:    psrlq %xmm2, %xmm1
+; SSE-NEXT:    psrlq %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psubq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
@@ -957,7 +959,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
@@ -967,7 +970,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -976,8 +980,9 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v2i64:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
@@ -986,7 +991,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
 ; AVX512-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
@@ -1160,17 +1166,19 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
@@ -1178,9 +1186,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1188,9 +1197,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
 ; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1199,9 +1209,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -1209,9 +1220,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
 ; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index abd81a0e9f99a04..5afb48bea3f503e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -1010,7 +1010,8 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
@@ -1026,7 +1027,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
@@ -1036,7 +1038,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -1048,7 +1051,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
 ; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
@@ -1059,7 +1063,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ; AVX512-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512-NEXT:    retq
@@ -1265,9 +1270,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vpshab %xmm1, %xmm2, %xmm2
@@ -1277,8 +1283,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
@@ -1304,9 +1311,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
 ; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1327,9 +1335,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
 ; AVX512BWVL-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 74dbee5e5d2ca72..67ff078014e0242 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -233,7 +233,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; ALL-LABEL: splatvar_modulo_shift_v8i64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; ALL-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 467c1574180da15..1efbf0d0f0ca4c3 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -771,25 +771,29 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-LABEL: splatvar_modulo_shift_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    psrlq %xmm1, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm2 = [63,0]
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    psrlq %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: splatvar_modulo_shift_v2i64:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOP-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
@@ -941,17 +945,19 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
@@ -959,9 +965,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -969,9 +976,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -980,9 +988,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQVL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -990,9 +999,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BWVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index ca303b4c7ebf677..64db9c6d33f9bb9 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -823,8 +823,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -832,14 +833,16 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -847,13 +850,15 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
@@ -1037,9 +1042,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
@@ -1049,8 +1055,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
@@ -1072,9 +1079,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1092,9 +1100,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BWVL-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d5702fb93a13..6640dfb13f4d1be 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -188,7 +188,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; ALL-LABEL: splatvar_modulo_shift_v8i64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; ALL-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 4dda9ff09cc62dc..8408179ebee0760 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -678,25 +678,29 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-LABEL: splatvar_modulo_shift_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    psllq %xmm1, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm2 = [63,0]
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    psllq %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: splatvar_modulo_shift_v2i64:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOP-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
@@ -848,24 +852,27 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -873,9 +880,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -884,9 +892,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQVL-NEXT:    vpslld %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -894,9 +903,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BWVL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index c80f24ad5777301..7f0e3388944e037 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -748,8 +748,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpsllq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -757,14 +758,16 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsllq %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -772,13 +775,15 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
@@ -962,9 +967,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
@@ -973,9 +979,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -993,9 +1000,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1012,9 +1020,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BWVL-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index a42056be895e7ac..540bab0cdc33a60 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -181,7 +181,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; ALL-LABEL: splatvar_modulo_shift_v8i64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; ALL-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 1d389f981722946..fdab5b797f3faa2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1222,7 +1222,7 @@ define <2 x double> @insert_dup_mem_v2f64(ptr %ptr) {
 define <2 x double> @insert_dup_mem128_v2f64(ptr %ptr) nounwind {
 ; SSE2-LABEL: insert_dup_mem128_v2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps (%rdi), %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT:    retq
 ;
@@ -1308,7 +1308,7 @@ define <2 x double> @shuffle_mem_v2f64_02(<2 x double> %a, ptr %pb) {
 ;
 ; AVX-LABEL: shuffle_mem_v2f64_02:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; AVX-NEXT:    retq
   %b = load <2 x double>, ptr %pb, align 1
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
@@ -1316,30 +1316,14 @@ define <2 x double> @shuffle_mem_v2f64_02(<2 x double> %a, ptr %pb) {
 }
 
 define <2 x double> @shuffle_mem_v2f64_21(<2 x double> %a, ptr %pb) {
-; SSE2-LABEL: shuffle_mem_v2f64_21:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_mem_v2f64_21:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_mem_v2f64_21:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_mem_v2f64_21:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movups (%rdi), %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_mem_v2f64_21:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_mem_v2f64_21:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX-NEXT:    retq
   %b = load <2 x double>, ptr %pb, align 1
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index a79b109feec72b0..428dffcdda576a3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2463,7 +2463,7 @@ define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, ptr %pb) {
 ;
 ; AVX-LABEL: shuffle_mem_v4f32_0145:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; AVX-NEXT:    retq
   %b = load <4 x float>, ptr %pb, align 1
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -2471,30 +2471,14 @@ define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, ptr %pb) {
 }
 
 define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, ptr %pb) {
-; SSE2-LABEL: shuffle_mem_v4f32_4523:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_mem_v4f32_4523:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_mem_v4f32_4523:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_mem_v4f32_4523:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movups (%rdi), %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_mem_v4f32_4523:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_mem_v4f32_4523:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX-NEXT:    retq
   %b = load <4 x float>, ptr %pb, align 1
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -2525,23 +2509,46 @@ define  <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, ptr %a1) {
 }
 
 define  <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, ptr %a1) {
-; SSE-LABEL: shuffle_mem_v4f32_4760:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_mem_v4f32_4760:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_mem_v4f32_4760:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_mem_v4f32_4760:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_mem_v4f32_4760:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSE41-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
-; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
+; AVX1OR2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_mem_v4f32_4760:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,3,2,4]
-; AVX512VL-NEXT:    vpermt2ps (%rdi), %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,3,2,4]
+; AVX512VL-NEXT:    vpermt2ps %xmm1, %xmm2, %xmm0
 ; AVX512VL-NEXT:    retq
   %1 = load <4 x float>, ptr %a1
   %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 4, i32 7, i32 6, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index ec54b755135829e..feefa8fb875e54b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -914,16 +914,15 @@ define void @PR63030(ptr %p0) {
 ;
 ; X64-AVX2-LABEL: PR63030:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; X64-AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [3,3]
-; X64-AVX2-NEXT:    # xmm1 = mem[0,0]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[1,1,0,0]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; X64-AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [3,2]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
-; X64-AVX2-NEXT:    vmovaps %ymm0, (%rax)
-; X64-AVX2-NEXT:    vmovaps %ymm1, (%rax)
+; X64-AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [3,3]
+; X64-AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[1,1,0,0]
+; X64-AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; X64-AVX2-NEXT:    vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; X64-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; X64-AVX2-NEXT:    vmovdqa %ymm0, (%rax)
+; X64-AVX2-NEXT:    vmovdqa %ymm1, (%rax)
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 5c035346415b0ed..514523efef2a9a5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -796,21 +796,26 @@ define <16 x i8> @constant_fold_pshufb_2() {
 define i32 @mask_zzz3_v16i8(<16 x i8> %a0) {
 ; SSSE3-LABEL: mask_zzz3_v16i8:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    movzbl {{\.?LCPI[0-9]+_[0-9]+}}+3(%rip), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    pslld $24, %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
 ; SSSE3-NEXT:    movd %xmm0, %eax
 ; SSSE3-NEXT:    andl $-16777216, %eax # imm = 0xFF000000
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mask_zzz3_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    pinsrb $15, {{\.?LCPI[0-9]+_[0-9]+}}+15(%rip), %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
 ; SSE41-NEXT:    pextrd $3, %xmm0, %eax
 ; SSE41-NEXT:    andl $-16777216, %eax # imm = 0xFF000000
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: mask_zzz3_v16i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $15, {{\.?LCPI[0-9]+_[0-9]+}}+15(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpextrd $3, %xmm0, %eax
 ; AVX-NEXT:    andl $-16777216, %eax # imm = 0xFF000000
 ; AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 07c770abc65d6c4..a7210cffc80c08f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1749,13 +1749,21 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_test1c:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_test1c:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test1c:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %A = load <4 x i8>, ptr %a
   %B = load <4 x i8>, ptr %b
   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1810,17 +1818,21 @@ define <4 x i8> @combine_test3c(ptr %a, ptr %b) {
 define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
 ; SSE2-LABEL: combine_test4c:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    andps %xmm0, %xmm2
-; SSE2-NEXT:    andnps %xmm1, %xmm0
-; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    movzbl 1(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test4c:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movzbl 1(%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    psllw $8, %xmm1
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -1828,20 +1840,32 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
 ;
 ; SSE41-LABEL: combine_test4c:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    movzbl 1(%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm1
+; SSE41-NEXT:    psllw $8, %xmm1
 ; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE41-NEXT:    movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_test4c:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_test4c:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzbl 1(%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test4c:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb 1(%rdi), %xmm0
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %A = load <4 x i8>, ptr %a
   %B = load <4 x i8>, ptr %b
   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -2821,16 +2845,16 @@ define <4 x float> @PR30264(<4 x float> %x) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: PR30264:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: PR30264:
@@ -3051,17 +3075,18 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    psraw $8, %xmm1
 ; SSE2-NEXT:    pextrw $7, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    movsbl (%rsi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movsbl (%rdx), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = [65531,0,0,0]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movsbl (%rsi), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movsbl (%rdx), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
@@ -3077,7 +3102,8 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = [65531,0,0,0]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-NEXT:    retq
 ;
@@ -3555,14 +3581,15 @@ define void @SpinningCube() {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
 ; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT:    xorps %xmm3, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSE2-NEXT:    addps %xmm3, %xmm1
-; SSE2-NEXT:    movaps %xmm1, (%rax)
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, (%rax)
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE2-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -3574,14 +3601,15 @@ define void @SpinningCube() {
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
 ; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSSE3-NEXT:    movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
-; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSSE3-NEXT:    xorps %xmm3, %xmm3
-; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSSE3-NEXT:    addps %xmm3, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, (%rax)
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT:    movaps %xmm2, %xmm3
+; SSSE3-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
+; SSSE3-NEXT:    addps %xmm1, %xmm2
+; SSSE3-NEXT:    movaps %xmm2, (%rax)
 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
 ; SSSE3-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 2b89590a0bb419d..ecdea22d7b5a4de 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -42,7 +42,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0]
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = [18446744073709551615,0]
 ; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
@@ -56,7 +56,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [18446744073709551615,0]
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm2 = [18446744073709551615,0]
 ; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
@@ -67,7 +67,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
-; VL_BW_DQ-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0]
+; VL_BW_DQ-NEXT:    vmovq {{.*#+}} xmm1 = [18446744073709551615,0]
 ; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
index e83c1e84827737b..fea59d965761272 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
@@ -69,7 +69,7 @@ define <64 x i8> @f1(ptr %p0) {
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm0 ^ (mem & (ymm2 ^ ymm0))
 ; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512F-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
@@ -83,7 +83,7 @@ define <64 x i8> @f1(ptr %p0) {
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,5,7,11,13,17,19,23,25,29,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm1 & mem)
 ; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -214,7 +214,7 @@ define <64 x i8> @f2(ptr %p0) {
 ; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm4
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem)
 ; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm0
 ; AVX512F-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm4
@@ -228,7 +228,7 @@ define <64 x i8> @f2(ptr %p0) {
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: f2:
@@ -344,7 +344,7 @@ define <64 x i8> @f3(ptr %p0) {
 ; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpmovsxwd {{.*#+}} ymm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
-; AVX512F-NEXT:    vpternlogq $216, %ymm5, %ymm2, %ymm0
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm5 & (ymm0 ^ ymm2))
 ; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm6
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
@@ -369,7 +369,7 @@ define <64 x i8> @f3(ptr %p0) {
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vpternlogq $226, %ymm1, %ymm5, %ymm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm5 & (ymm2 ^ ymm1))
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -497,7 +497,7 @@ define <64 x i8> @f4(ptr %p0) {
 ; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm4
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem)
 ; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm0
 ; AVX512F-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm4
@@ -511,7 +511,7 @@ define <64 x i8> @f4(ptr %p0) {
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: f4:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
index 0efbe018764d281..ec81ecf6faa8577 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
@@ -35,7 +35,7 @@ define <32 x i8> @foo(ptr %x0) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqu 32(%rdi), %xmm0
 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu 16(%rdi), %xmm2
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
@@ -53,7 +53,7 @@ define <32 x i8> @foo(ptr %x0) {
 ; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqu 16(%rdi), %xmm2
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,2,3,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero,ymm0[24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
@@ -65,7 +65,7 @@ define <32 x i8> @foo(ptr %x0) {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqu 32(%rdi), %xmm0
 ; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX512BW-NEXT:    vmovdqu 16(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    movl $63488, %eax # imm = 0xF800
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index be6ee8f6899584f..494d216dfa0ed4d 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -761,19 +761,31 @@ define void @vselect_allzeros_LHS_multiple_use_setcc(<4 x i32> %x, <4 x i32> %y,
 ; This test case previously crashed after r363802, r363850, and r363856 due
 ; any_extend_vector_inreg not being handled by the X86 backend.
 define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
-; SSE-LABEL: vselect_any_extend_vector_inreg_crash:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $15, %eax
-; SSE-NEXT:    retq
+; SSE2-LABEL: vselect_any_extend_vector_inreg_crash:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = [49,0,0,0]
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    shll $15, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: vselect_any_extend_vector_inreg_crash:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT:    pinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    shll $15, %eax
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: vselect_any_extend_vector_inreg_crash:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    andl $1, %eax
 ; AVX1-NEXT:    shll $15, %eax
diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll
index 8e47ed67bdcffcd..3c8d358413eca85 100644
--- a/llvm/test/CodeGen/X86/widened-broadcast.ll
+++ b/llvm/test/CodeGen/X86/widened-broadcast.ll
@@ -140,23 +140,14 @@ entry:
 define <8 x i16> @load_splat_8i16_8i16_01010101(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_8i16_8i16_01010101:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_8i16_8i16_01010101:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_8i16_8i16_01010101:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_8i16_8i16_01010101:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_8i16_8i16_01010101:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <8 x i16>, ptr %ptr
   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -192,7 +183,8 @@ entry:
 define <16 x i16> @load_splat_16i16_8i16_0101010101010101(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -226,7 +218,8 @@ entry:
 define <16 x i16> @load_splat_16i16_16i16_0101010101010101(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_16i16_16i16_0101010101010101:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -288,23 +281,14 @@ entry:
 define <16 x i8> @load_splat_16i8_16i8_0123012301230123(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_16i8_16i8_0123012301230123:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_16i8_16i8_0123012301230123:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_16i8_16i8_0123012301230123:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_16i8_16i8_0123012301230123:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_16i8_16i8_0123012301230123:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x i8>, ptr %ptr
   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -347,7 +331,8 @@ define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(ptr %ptr
 ;
 ; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -370,7 +355,8 @@ entry:
 define <32 x i8> @load_splat_32i8_16i8_01230123012301230123012301230123(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -411,7 +397,8 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(ptr %ptr
 ;
 ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -434,7 +421,8 @@ entry:
 define <32 x i8> @load_splat_32i8_32i8_01230123012301230123012301230123(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -468,7 +456,7 @@ entry:
 define <4 x float> @load_splat_4f32_8f32_0000(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_4f32_8f32_0000:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 3e76bffb77a665c..91831d2326bbbd3 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -53,10 +53,12 @@ define <4 x double> @load_factorf64_2(ptr %ptr) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovupd (%rdi), %ymm0
 ; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovupd 64(%rdi), %ymm2
-; AVX1-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX1-NEXT:    vbroadcastsd 88(%rdi), %ymm2
+; AVX1-NEXT:    vbroadcastsd 120(%rdi), %ymm3
+; AVX1-NEXT:    vmovsd 64(%rdi), %xmm4 # xmm4 = mem[0],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX1-NEXT:    vmovsd 96(%rdi), %xmm5 # xmm5 = mem[0],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm5
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
@@ -88,21 +90,21 @@ define <4 x double> @load_factorf64_2(ptr %ptr) nounwind {
 define <4 x double> @load_factorf64_1(ptr %ptr) nounwind {
 ; AVX1-LABEL: load_factorf64_1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
-; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX1-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT:    vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovsd 64(%rdi), %xmm1 # xmm1 = mem[0],zero
+; AVX1-NEXT:    vmovhps 96(%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-NEXT:    vmovhps 32(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512-LABEL: load_factorf64_1:
 ; AVX2OR512:       # %bb.0:
-; AVX2OR512-NEXT:    vmovupd (%rdi), %ymm0
-; AVX2OR512-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX2OR512-NEXT:    vperm2f128 $32, 64(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0,1],mem[0,1]
-; AVX2OR512-NEXT:    vperm2f128 $32, 96(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1],mem[0,1]
-; AVX2OR512-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2OR512-NEXT:    vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; AVX2OR512-NEXT:    vmovsd 64(%rdi), %xmm1 # xmm1 = mem[0],zero
+; AVX2OR512-NEXT:    vmovhpd 96(%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
+; AVX2OR512-NEXT:    vmovhpd 32(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
+; AVX2OR512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2OR512-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
 ; AVX2OR512-NEXT:    retq
   %wide.vec = load <16 x double>, ptr %ptr, align 16
@@ -1873,8 +1875,9 @@ define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind {
 define <2 x i64> @PR37616(ptr %a0) nounwind {
 ; AVX-LABEL: PR37616:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
-; AVX-NEXT:    vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovsd 48(%rdi), %xmm0 # xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd 16(%rdi), %xmm1 # xmm1 = mem[0],zero
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX-NEXT:    retq
   %load = load <16 x i64>, ptr %a0, align 128
   %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
diff --git a/llvm/test/CodeGen/X86/xop-shifts.ll b/llvm/test/CodeGen/X86/xop-shifts.ll
index 83dcf9ce0d1e90d..1512a488846a8ae 100644
--- a/llvm/test/CodeGen/X86/xop-shifts.ll
+++ b/llvm/test/CodeGen/X86/xop-shifts.ll
@@ -8,9 +8,12 @@
 define <16 x i8> @demandedelts_vpshab(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: demandedelts_vpshab:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vpshab %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %shift = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %shuffle, <16 x i8> %a1)
diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll
index 2bef66825d8c02e..f5c879b2011cd13 100644
--- a/llvm/test/CodeGen/X86/xor.ll
+++ b/llvm/test/CodeGen/X86/xor.ll
@@ -405,8 +405,11 @@ define i32 @PR17487(i1 %tobool) {
 ; X64-LIN:       # %bb.0:
 ; X64-LIN-NEXT:    movd %edi, %xmm0
 ; X64-LIN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; X64-LIN-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-LIN-NEXT:    pextrw $4, %xmm0, %eax
+; X64-LIN-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-LIN-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X64-LIN-NEXT:    pand %xmm0, %xmm1
+; X64-LIN-NEXT:    pextrw $4, %xmm1, %eax
+; X64-LIN-NEXT:    movzbl %al, %eax
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: PR17487:
@@ -414,8 +417,11 @@ define i32 @PR17487(i1 %tobool) {
 ; X64-WIN-NEXT:    movzbl %cl, %eax
 ; X64-WIN-NEXT:    movd %eax, %xmm0
 ; X64-WIN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; X64-WIN-NEXT:    pand __xmm at 00000000000000010000000000000001(%rip), %xmm0
-; X64-WIN-NEXT:    pextrw $4, %xmm0, %eax
+; X64-WIN-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-WIN-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X64-WIN-NEXT:    pand %xmm0, %xmm1
+; X64-WIN-NEXT:    pextrw $4, %xmm1, %eax
+; X64-WIN-NEXT:    movzbl %al, %eax
 ; X64-WIN-NEXT:    retq
   %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1
   %tmp1 = zext <2 x i1> %tmp to <2 x i64>
