[llvm] [MCP] Move dependencies if they block copy propagation (PR #105562)

Gábor Spaits via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 30 08:29:11 PDT 2024


https://github.com/spaits updated https://github.com/llvm/llvm-project/pull/105562

>From ee39a4154ab84763d141e02a31b4d085ee1babd2 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Wed, 21 Aug 2024 19:59:34 +0200
Subject: [PATCH 01/15] [MCP] Move unnecesary dependencies if they block copy
 propagation

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp   |  278 +-
 .../AArch64/GlobalISel/arm64-atomic.ll        |   10 +-
 .../AArch64/GlobalISel/arm64-pcsections.ll    |  196 +-
 .../GlobalISel/merge-stores-truncating.ll     |    3 +-
 llvm/test/CodeGen/AArch64/aarch64-mulv.ll     |    3 +-
 llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll |   10 +-
 llvm/test/CodeGen/AArch64/addp-shuffle.ll     |    6 +-
 .../CodeGen/AArch64/anti-dependencies-mcp.mir |  201 ++
 .../CodeGen/AArch64/arm64-non-pow2-ldst.ll    |    8 +-
 .../CodeGen/AArch64/arm64-subvector-extend.ll |  102 +-
 .../CodeGen/AArch64/arm64-windows-calls.ll    |    3 +-
 llvm/test/CodeGen/AArch64/avoid-zero-copy.mir |    3 +
 llvm/test/CodeGen/AArch64/cgp-usubo.ll        |   15 +-
 llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll   |   12 +-
 .../CodeGen/AArch64/duplane-index-patfrags.ll |   12 +-
 llvm/test/CodeGen/AArch64/fcmp.ll             |   18 +-
 llvm/test/CodeGen/AArch64/fexplog.ll          |  510 ++--
 llvm/test/CodeGen/AArch64/fpext.ll            |   46 +-
 llvm/test/CodeGen/AArch64/fpow.ll             |   56 +-
 llvm/test/CodeGen/AArch64/fpowi.ll            |  102 +-
 llvm/test/CodeGen/AArch64/frem.ll             |   56 +-
 llvm/test/CodeGen/AArch64/fsincos.ll          |  204 +-
 .../test/CodeGen/AArch64/ldrpre-ldr-merge.mir |  152 +-
 llvm/test/CodeGen/AArch64/llvm.exp10.ll       |   18 +-
 llvm/test/CodeGen/AArch64/load.ll             |    3 +-
 .../AArch64/lr-reserved-for-ra-live-in.ll     |    4 +-
 .../CodeGen/AArch64/machine-cp-sub-reg.mir    |    6 +-
 .../AArch64/machine-sink-kill-flags.ll        |    5 +-
 .../AArch64/named-vector-shuffles-neon.ll     |    6 +-
 llvm/test/CodeGen/AArch64/neon-extadd.ll      |   54 +-
 llvm/test/CodeGen/AArch64/neon-extmul.ll      |   10 +-
 llvm/test/CodeGen/AArch64/neon-perm.ll        |    3 +-
 llvm/test/CodeGen/AArch64/sext.ll             |   81 +-
 llvm/test/CodeGen/AArch64/shufflevector.ll    |   17 +-
 llvm/test/CodeGen/AArch64/spillfill-sve.mir   |  112 +-
 .../streaming-compatible-memory-ops.ll        |    5 +-
 llvm/test/CodeGen/AArch64/sve-sext-zext.ll    |   27 +-
 .../sve-streaming-mode-fixed-length-trunc.ll  |    4 +-
 .../AArch64/sve-vector-deinterleave.ll        |   15 +-
 .../CodeGen/AArch64/sve-vector-interleave.ll  |    6 +-
 llvm/test/CodeGen/AArch64/vec_umulo.ll        |   14 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    |   49 +-
 llvm/test/CodeGen/AArch64/vselect-ext.ll      |   17 +-
 llvm/test/CodeGen/AArch64/zext-to-tbl.ll      |   18 +-
 llvm/test/CodeGen/AArch64/zext.ll             |   81 +-
 llvm/test/CodeGen/ARM/addsubo-legalization.ll |    6 +-
 llvm/test/CodeGen/ARM/fpclamptosat_vec.ll     |   43 +-
 llvm/test/CodeGen/ARM/funnel-shift.ll         |    5 +-
 llvm/test/CodeGen/ARM/llvm.exp10.ll           |   12 +-
 .../CodeGen/ARM/load-combine-big-endian.ll    |   12 +-
 llvm/test/CodeGen/ARM/load-combine.ll         |    8 +-
 llvm/test/CodeGen/ARM/sub-cmp-peephole.ll     |   20 +-
 .../ARM/vecreduce-fadd-legalization-strict.ll |   34 +-
 llvm/test/CodeGen/ARM/vlddup.ll               |   30 +-
 llvm/test/CodeGen/ARM/vldlane.ll              |   31 +-
 llvm/test/CodeGen/RISCV/alu64.ll              |    6 +-
 llvm/test/CodeGen/RISCV/branch-on-zero.ll     |    6 +-
 llvm/test/CodeGen/RISCV/condops.ll            |   18 +-
 llvm/test/CodeGen/RISCV/double-fcmp-strict.ll |   36 +-
 llvm/test/CodeGen/RISCV/float-fcmp-strict.ll  |   18 +-
 llvm/test/CodeGen/RISCV/half-fcmp-strict.ll   |   18 +-
 llvm/test/CodeGen/RISCV/llvm.frexp.ll         |   30 +-
 llvm/test/CodeGen/RISCV/machine-cp.mir        |    9 +-
 llvm/test/CodeGen/RISCV/neg-abs.ll            |   10 +-
 llvm/test/CodeGen/RISCV/nontemporal.ll        |  125 +-
 .../test/CodeGen/RISCV/overflow-intrinsics.ll |   13 +-
 llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll       |    8 +-
 .../CodeGen/RISCV/rv64-legal-i32/xaluo.ll     | 2603 +++++++++++++++++
 .../RISCV/rv64-statepoint-call-lowering.ll    |    3 +-
 .../RISCV/rvv/constant-folding-crash.ll       |    6 +-
 .../rvv/fixed-vectors-deinterleave-load.ll    |    2 +-
 .../RISCV/rvv/fixed-vectors-fmaximum-vp.ll    |   42 +-
 .../RISCV/rvv/fixed-vectors-fmaximum.ll       |   52 +-
 .../RISCV/rvv/fixed-vectors-fminimum-vp.ll    |   42 +-
 .../RISCV/rvv/fixed-vectors-fminimum.ll       |   52 +-
 .../rvv/fixed-vectors-masked-store-fp.ll      |    8 +-
 .../rvv/fixed-vectors-masked-store-int.ll     |    8 +-
 .../RISCV/rvv/fixed-vectors-reduction-fp.ll   |   24 +-
 .../rvv/fixed-vectors-reduction-int-vp.ll     |   24 +-
 .../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll |   58 +-
 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll    |   52 +-
 .../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll |   58 +-
 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll    |   52 +-
 .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll     |    9 +-
 .../test/CodeGen/RISCV/rvv/mask-reg-alloc.mir |    2 +-
 .../CodeGen/RISCV/rvv/no-reserved-frame.ll    |    7 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |    2 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vmfeq.ll          |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmfge.ll          |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmfgt.ll          |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmfle.ll          |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmflt.ll          |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmfne.ll          |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmseq.ll          |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsge.ll          |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll         |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgt.ll          |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll         |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsle.ll          |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsleu.ll         |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vmslt.ll          |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsltu.ll         |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsne.ll          |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll     |    2 +-
 .../CodeGen/RISCV/rvv/vsetvli-regression.ll   |    3 +-
 llvm/test/CodeGen/RISCV/rvv/vxrm.mir          |    2 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |    6 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll    |   31 +-
 llvm/test/CodeGen/RISCV/tail-calls.ll         |   14 +-
 .../CodeGen/RISCV/unaligned-load-store.ll     |    3 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    |   37 +-
 llvm/test/CodeGen/RISCV/wide-mem.ll           |    3 +-
 ...lar-shift-by-byte-multiple-legalization.ll |    6 +-
 .../RISCV/wide-scalar-shift-legalization.ll   |    6 +-
 llvm/test/CodeGen/RISCV/xaluo.ll              |   81 +-
 llvm/test/CodeGen/RISCV/xtheadmemidx.ll       |    2 +-
 llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir   |   16 +-
 llvm/test/CodeGen/Thumb/smul_fix_sat.ll       |    6 +-
 .../Thumb/umulo-128-legalisation-lowering.ll  |    4 +-
 llvm/test/CodeGen/Thumb2/mve-div-expand.ll    |   17 +-
 llvm/test/CodeGen/Thumb2/mve-fmath.ll         |   95 +-
 .../CodeGen/Thumb2/mve-fpclamptosat_vec.ll    |   32 +-
 .../CodeGen/Thumb2/mve-fptosi-sat-vector.ll   |   40 +-
 .../CodeGen/Thumb2/mve-fptoui-sat-vector.ll   |   42 +-
 llvm/test/CodeGen/Thumb2/mve-frint.ll         |   24 +-
 .../CodeGen/Thumb2/mve-laneinterleaving.ll    |    6 +-
 .../CodeGen/Thumb2/mve-sext-masked-load.ll    |    9 +-
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll       |   24 +-
 llvm/test/CodeGen/Thumb2/mve-shufflemov.ll    |   50 +-
 llvm/test/CodeGen/Thumb2/mve-simple-arith.ll  |   18 +-
 llvm/test/CodeGen/Thumb2/mve-vabdus.ll        |    6 +-
 llvm/test/CodeGen/Thumb2/mve-vcvt.ll          |    8 +-
 llvm/test/CodeGen/Thumb2/mve-vcvt16.ll        |    4 +-
 llvm/test/CodeGen/Thumb2/mve-vld4.ll          |    4 +-
 llvm/test/CodeGen/Thumb2/mve-vmovn.ll         |    4 +-
 llvm/test/CodeGen/Thumb2/mve-vst4.ll          |    4 +-
 .../CodeGen/Thumb2/mve-zext-masked-load.ll    |   10 +-
 llvm/test/CodeGen/X86/apx/mul-i1024.ll        |   19 +-
 llvm/test/CodeGen/X86/atomic-unordered.ll     |    2 +-
 .../CodeGen/X86/avx10_2_512ni-intrinsics.ll   |    6 +-
 llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll |   12 +-
 llvm/test/CodeGen/X86/avx512-calling-conv.ll  |   38 +-
 .../CodeGen/X86/avx512-gfni-intrinsics.ll     |   60 +-
 .../test/CodeGen/X86/avx512-insert-extract.ll |   14 +-
 llvm/test/CodeGen/X86/avx512-intrinsics.ll    |    6 +-
 llvm/test/CodeGen/X86/avx512-mask-op.ll       |   10 +-
 .../X86/avx512bw-intrinsics-upgrade.ll        |   20 +-
 llvm/test/CodeGen/X86/avx512bw-intrinsics.ll  |    6 +-
 .../X86/avx512bwvl-intrinsics-upgrade.ll      |   12 +-
 .../test/CodeGen/X86/avx512bwvl-intrinsics.ll |   30 +-
 .../X86/avx512vbmi2vl-intrinsics-upgrade.ll   |   84 +-
 .../CodeGen/X86/avx512vbmi2vl-intrinsics.ll   |   12 +-
 .../X86/avx512vl-intrinsics-upgrade.ll        |   12 +-
 .../X86/div-rem-pair-recomposition-signed.ll  |    2 +-
 .../div-rem-pair-recomposition-unsigned.ll    |    2 +-
 .../element-wise-atomic-memory-intrinsics.ll  |    6 +-
 .../CodeGen/X86/expand-vp-cast-intrinsics.ll  |    3 +-
 llvm/test/CodeGen/X86/extract-bits.ll         |   33 +-
 llvm/test/CodeGen/X86/icmp-abs-C-vec.ll       |    6 +-
 llvm/test/CodeGen/X86/is_fpclass.ll           |    6 +-
 llvm/test/CodeGen/X86/ldexp.ll                |    3 +-
 llvm/test/CodeGen/X86/legalize-shl-vec.ll     |   16 +-
 llvm/test/CodeGen/X86/matrix-multiply.ll      |   60 +-
 llvm/test/CodeGen/X86/mul-i1024.ll            |    2 +-
 llvm/test/CodeGen/X86/mul-i256.ll             |    2 +-
 llvm/test/CodeGen/X86/mul-i512.ll             |    2 +-
 .../X86/peephole-na-phys-copy-folding.ll      |    2 +-
 llvm/test/CodeGen/X86/pmul.ll                 |    3 +-
 llvm/test/CodeGen/X86/pmulh.ll                |   12 +-
 llvm/test/CodeGen/X86/pointer-vector.ll       |    3 +-
 llvm/test/CodeGen/X86/pr11334.ll              |    3 +-
 llvm/test/CodeGen/X86/pr34177.ll              |    2 +-
 llvm/test/CodeGen/X86/pr61964.ll              |   10 +-
 llvm/test/CodeGen/X86/shift-i128.ll           |    8 +-
 llvm/test/CodeGen/X86/sibcall.ll              |    3 +-
 llvm/test/CodeGen/X86/smul_fix.ll             |    2 +-
 llvm/test/CodeGen/X86/smul_fix_sat.ll         |    3 +-
 .../X86/smulo-128-legalisation-lowering.ll    |   12 +-
 .../subvectorwise-store-of-vector-splat.ll    |   30 +-
 llvm/test/CodeGen/X86/umul-with-overflow.ll   |    7 +-
 llvm/test/CodeGen/X86/umul_fix.ll             |    2 +-
 llvm/test/CodeGen/X86/umul_fix_sat.ll         |    6 +-
 .../X86/umulo-128-legalisation-lowering.ll    |    3 +-
 llvm/test/CodeGen/X86/vec_int_to_fp.ll        |   15 +-
 llvm/test/CodeGen/X86/vec_saddo.ll            |   14 +-
 llvm/test/CodeGen/X86/vec_ssubo.ll            |    5 +-
 llvm/test/CodeGen/X86/vec_umulo.ll            |   29 +-
 llvm/test/CodeGen/X86/vector-interleave.ll    |    6 +-
 .../vector-interleaved-load-i16-stride-2.ll   |    4 +-
 .../vector-interleaved-load-i16-stride-3.ll   |   16 +-
 .../vector-interleaved-load-i16-stride-4.ll   |    4 +-
 .../vector-interleaved-load-i16-stride-5.ll   |   26 +-
 .../vector-interleaved-load-i16-stride-6.ll   |   20 +-
 .../vector-interleaved-load-i16-stride-7.ll   |   49 +-
 .../vector-interleaved-load-i16-stride-8.ll   |   64 +-
 .../vector-interleaved-load-i32-stride-3.ll   |   44 +-
 .../vector-interleaved-load-i32-stride-4.ll   |   40 +-
 .../vector-interleaved-load-i32-stride-5.ll   |  133 +-
 .../vector-interleaved-load-i32-stride-6.ll   |   51 +-
 .../vector-interleaved-load-i32-stride-7.ll   |   60 +-
 .../vector-interleaved-load-i32-stride-8.ll   |  170 +-
 .../vector-interleaved-load-i64-stride-4.ll   |   12 +-
 .../vector-interleaved-load-i64-stride-5.ll   |   40 +-
 .../vector-interleaved-load-i64-stride-6.ll   |   16 +-
 .../vector-interleaved-load-i64-stride-7.ll   |  164 +-
 .../vector-interleaved-load-i64-stride-8.ll   |  464 +--
 .../vector-interleaved-load-i8-stride-3.ll    |    6 +-
 .../vector-interleaved-load-i8-stride-4.ll    |   12 +-
 .../vector-interleaved-load-i8-stride-5.ll    |   10 +-
 .../vector-interleaved-load-i8-stride-6.ll    |   22 +-
 .../vector-interleaved-load-i8-stride-7.ll    |   82 +-
 .../vector-interleaved-load-i8-stride-8.ll    |   98 +-
 .../vector-interleaved-store-i16-stride-3.ll  |   64 +-
 .../vector-interleaved-store-i16-stride-4.ll  |   24 +-
 .../vector-interleaved-store-i16-stride-5.ll  |   30 +-
 .../vector-interleaved-store-i16-stride-6.ll  |   48 +-
 .../vector-interleaved-store-i16-stride-7.ll  |   81 +-
 .../vector-interleaved-store-i32-stride-2.ll  |   24 +-
 .../vector-interleaved-store-i32-stride-3.ll  |    4 +-
 .../vector-interleaved-store-i32-stride-5.ll  |   22 +-
 .../vector-interleaved-store-i32-stride-6.ll  |  116 +-
 .../vector-interleaved-store-i32-stride-7.ll  |  115 +-
 .../vector-interleaved-store-i32-stride-8.ll  |    4 +-
 .../vector-interleaved-store-i64-stride-3.ll  |    8 +-
 .../vector-interleaved-store-i64-stride-4.ll  |   48 +-
 .../vector-interleaved-store-i64-stride-5.ll  |   10 +-
 .../vector-interleaved-store-i64-stride-7.ll  |  282 +-
 .../vector-interleaved-store-i64-stride-8.ll  |   96 +-
 .../vector-interleaved-store-i8-stride-3.ll   |    2 +-
 .../vector-interleaved-store-i8-stride-5.ll   |    8 +-
 .../vector-interleaved-store-i8-stride-6.ll   |   38 +-
 .../vector-interleaved-store-i8-stride-7.ll   |   37 +-
 .../vector-interleaved-store-i8-stride-8.ll   |    8 +-
 llvm/test/CodeGen/X86/vector-intrinsics.ll    |    4 +-
 llvm/test/CodeGen/X86/vector-sext.ll          |   15 +-
 .../X86/vector-shuffle-combining-avx.ll       |   21 +-
 llvm/test/CodeGen/X86/vector-zext.ll          |    6 +-
 .../X86/wide-scalar-shift-legalization.ll     |    2 +-
 .../CodeGen/X86/x86-interleaved-access.ll     |   10 +-
 llvm/test/CodeGen/X86/xmulo.ll                |  134 +-
 241 files changed, 6265 insertions(+), 4463 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
 create mode 100644 llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index b34e0939d1c7c6..493d7cd7d8c920 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -48,19 +48,27 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -70,9 +78,15 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <cassert>
 #include <iterator>
+#include <optional>
+#include <queue>
+#include <utility>
+#include <variant>
 
 using namespace llvm;
 
@@ -92,6 +106,113 @@ static cl::opt<cl::boolOrDefault>
     EnableSpillageCopyElimination("enable-spill-copy-elim", cl::Hidden);
 
 namespace {
+// A ScheduleDAG subclass that is used as a dependency graph.
+class ScheduleDAGMCP : public ScheduleDAGInstrs {
+public:
+  void schedule() override {
+    llvm_unreachable("This schedule dag is only used as a dependency graph for "
+                     "Machine Copy Propagation\n");
+  }
+
+  ScheduleDAGMCP(MachineFunction &MF, const MachineLoopInfo *MLI,
+                 bool RemoveKillFlags = false)
+      : ScheduleDAGInstrs(MF, MLI, RemoveKillFlags) {
+    CanHandleTerminators = true;
+  }
+};
+
+static bool moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
+                                               SUnit *Src,
+                                               ScheduleDAGMCP &DG) {
+  MachineInstr *DstInstr = Dst->getInstr();
+  MachineInstr *SrcInstr = Src->getInstr();
+  MachineBasicBlock *MBB = SrcInstr->getParent();
+
+  if (DstInstr == nullptr || SrcInstr == nullptr)
+    return false;
+  assert("This function only operates on a basic block level." &&
+         MBB == SrcInstr->getParent());
+
+  int SectionSize =
+      std::distance(SrcInstr->getIterator(), DstInstr->getIterator());
+
+  // The bit vector representing the instructions in the section.
+  // This vector stores which instruction needs to be moved and which does not.
+  BitVector SectionInstr(SectionSize, false);
+
+  // The queue for the breadth first search.
+  std::queue<const SUnit *> Edges;
+
+  // Process the children of a node.
+  // Basically every node are checked before it is being put into the queue.
+  // A node is enqueued if it has no dependencies on the source of the copy
+  // (only if we are not talking about the destination node which is a special
+  // case indicated by a flag) and is located between the source of the copy and
+  // the destination of the copy.
+  auto ProcessSNodeChildren = [SrcInstr, &SectionSize, &SectionInstr](
+                                  std::queue<const SUnit *> &Queue,
+                                  const SUnit *Node, bool IsRoot) -> bool {
+    for (llvm::SDep I : Node->Preds) {
+      SUnit *SU = I.getSUnit();
+      MachineInstr &MI = *(SU->getInstr());
+      if (!IsRoot && &MI == SrcInstr)
+        return false;
+
+      int DestinationFromSource =
+          std::distance(SrcInstr->getIterator(), MI.getIterator());
+
+      if (&MI != SrcInstr && DestinationFromSource > 0 &&
+          DestinationFromSource < SectionSize) {
+        // If an instruction is already in the Instructions to move map, than
+        // that means that it has already been processes with all of their
+        // dependence. We do not need to do anything with it again.
+        if (!SectionInstr[DestinationFromSource]) {
+          SectionInstr[DestinationFromSource] = true;
+          Queue.push(SU);
+        }
+      }
+    }
+    return true;
+  };
+
+  // The BFS happens here.
+  //
+  // Could not use the ADT implementation of BFS here.
+  // In ADT graph traversals we don't have the chance to select exactly which
+  // children are being put into the "nodes to traverse" queue or stack.
+  //
+  // We couldn't work around this by checking the need for the node in the
+  // processing stage. In some context it does matter what the parent of the
+  // instruction was: Namely when we are starting the traversal with the source
+  // of the copy propagation. This instruction must have the destination as a
+  // dependency. In case of other instruction than has the destination as a dependency, this
+  // dependency would mean the end of the traversal, but in this scenario this
+  // must be ignored. Let's say that we can not control what nodes to process
+  // and we come across the copy source. How do I know what node has that copy
+  // source as their dependency? We can check of which node is the copy source
+  // the dependency of. This list will alway contain the source. To decide if we
+  // have it as dependency of another instruction, we must check in the already
+  // traversed list if any of the instructions that is depended on the source is
+  // contained. This would introduce extra costs.
+  ProcessSNodeChildren(Edges, Dst, true);
+  while (!Edges.empty()) {
+    const auto *Current = Edges.front();
+    Edges.pop();
+    if (!ProcessSNodeChildren(Edges, Current, false))
+      return false;
+  }
+
+  // If all of the dependencies were deemed valid during the BFS then we
+  // are moving them before the copy source here keeping their relative
+  // order to each other.
+  auto CurrentInst = SrcInstr->getIterator();
+  for (int I = 0; I < SectionSize; I++) {
+    if (SectionInstr[I])
+      MBB->splice(SrcInstr->getIterator(), MBB, CurrentInst->getIterator());
+    ++CurrentInst;
+  }
+  return true;
+}
 
 static std::optional<DestSourcePair> isCopyInstr(const MachineInstr &MI,
                                                  const TargetInstrInfo &TII,
@@ -114,6 +235,7 @@ class CopyTracker {
   };
 
   DenseMap<MCRegUnit, CopyInfo> Copies;
+  DenseMap<MCRegUnit, CopyInfo> InvalidCopies;
 
 public:
   /// Mark all of the given registers and their subregisters as unavailable for
@@ -130,9 +252,14 @@ class CopyTracker {
     }
   }
 
+  int getInvalidCopiesSize() {
+    return InvalidCopies.size();
+  }
+
   /// Remove register from copy maps.
   void invalidateRegister(MCRegister Reg, const TargetRegisterInfo &TRI,
-                          const TargetInstrInfo &TII, bool UseCopyInstr) {
+                          const TargetInstrInfo &TII, bool UseCopyInstr,
+                          bool MayStillBePropagated = false) {
     // Since Reg might be a subreg of some registers, only invalidate Reg is not
     // enough. We have to find the COPY defines Reg or registers defined by Reg
     // and invalidate all of them. Similarly, we must invalidate all of the
@@ -158,8 +285,11 @@ class CopyTracker {
           InvalidateCopy(MI);
       }
     }
-    for (MCRegUnit Unit : RegUnitsToInvalidate)
+    for (MCRegUnit Unit : RegUnitsToInvalidate) {
+      if (Copies.contains(Unit) && MayStillBePropagated)
+        InvalidCopies[Unit] = Copies[Unit];
       Copies.erase(Unit);
+    }
   }
 
   /// Clobber a single register, removing it from the tracker's copy maps.
@@ -252,6 +382,10 @@ class CopyTracker {
     return !Copies.empty();
   }
 
+  bool hasAnyInvalidCopies() {
+    return !InvalidCopies.empty();
+  }
+
   MachineInstr *findCopyForUnit(MCRegUnit RegUnit,
                                 const TargetRegisterInfo &TRI,
                                 bool MustBeAvailable = false) {
@@ -263,6 +397,17 @@ class CopyTracker {
     return CI->second.MI;
   }
 
+  MachineInstr *findInvalidCopyForUnit(MCRegUnit RegUnit,
+                                const TargetRegisterInfo &TRI,
+                                bool MustBeAvailable = false) {
+    auto CI = InvalidCopies.find(RegUnit);
+    if (CI == InvalidCopies.end())
+      return nullptr;
+    if (MustBeAvailable && !CI->second.Avail)
+      return nullptr;
+    return CI->second.MI;
+  }
+
   MachineInstr *findCopyDefViaUnit(MCRegUnit RegUnit,
                                    const TargetRegisterInfo &TRI) {
     auto CI = Copies.find(RegUnit);
@@ -274,12 +419,28 @@ class CopyTracker {
     return findCopyForUnit(RU, TRI, true);
   }
 
+  MachineInstr *findInvalidCopyDefViaUnit(MCRegUnit RegUnit,
+                                   const TargetRegisterInfo &TRI) {
+    auto CI = InvalidCopies.find(RegUnit);
+    if (CI == InvalidCopies.end())
+      return nullptr;
+    if (CI->second.DefRegs.size() != 1)
+      return nullptr;
+    MCRegUnit RU = *TRI.regunits(CI->second.DefRegs[0]).begin();
+    return findInvalidCopyForUnit(RU, TRI, false);
+  }
+
+  // TODO: This is ugly there shall be a more elegant solution to invalid
+  //       copy searching. Create a variant that either returns a valid an invalid
+  //       copy or no copy at all (std::monotype).
   MachineInstr *findAvailBackwardCopy(MachineInstr &I, MCRegister Reg,
                                       const TargetRegisterInfo &TRI,
                                       const TargetInstrInfo &TII,
-                                      bool UseCopyInstr) {
+                                      bool UseCopyInstr,
+                                      bool SearchInvalid = false) {
     MCRegUnit RU = *TRI.regunits(Reg).begin();
-    MachineInstr *AvailCopy = findCopyDefViaUnit(RU, TRI);
+    MachineInstr *AvailCopy = SearchInvalid ? findInvalidCopyDefViaUnit(RU, TRI)
+                                            : findCopyDefViaUnit(RU, TRI);
 
     if (!AvailCopy)
       return nullptr;
@@ -377,13 +538,20 @@ class CopyTracker {
 
   void clear() {
     Copies.clear();
+    InvalidCopies.clear();
   }
 };
 
+using Copy = MachineInstr*;
+using InvalidCopy = std::pair<Copy, MachineInstr *>;
+using CopyLookupResult = std::variant<std::monostate, Copy, InvalidCopy>;
+
 class MachineCopyPropagation : public MachineFunctionPass {
+  LiveIntervals *LIS = nullptr;
   const TargetRegisterInfo *TRI = nullptr;
   const TargetInstrInfo *TII = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
+  AAResults *AA = nullptr;
 
   // Return true if this is a copy instruction and false otherwise.
   bool UseCopyInstr;
@@ -398,6 +566,7 @@ class MachineCopyPropagation : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -414,11 +583,11 @@ class MachineCopyPropagation : public MachineFunctionPass {
   void ReadRegister(MCRegister Reg, MachineInstr &Reader, DebugType DT);
   void readSuccessorLiveIns(const MachineBasicBlock &MBB);
   void ForwardCopyPropagateBlock(MachineBasicBlock &MBB);
-  void BackwardCopyPropagateBlock(MachineBasicBlock &MBB);
+  void BackwardCopyPropagateBlock(MachineBasicBlock &MBB, bool ResolveAntiDeps = false);
   void EliminateSpillageCopies(MachineBasicBlock &MBB);
   bool eraseIfRedundant(MachineInstr &Copy, MCRegister Src, MCRegister Def);
   void forwardUses(MachineInstr &MI);
-  void propagateDefs(MachineInstr &MI);
+  void propagateDefs(MachineInstr &MI, ScheduleDAGMCP &DG, bool ResolveAntiDeps = false);
   bool isForwardableRegClassCopy(const MachineInstr &Copy,
                                  const MachineInstr &UseI, unsigned UseIdx);
   bool isBackwardPropagatableRegClassCopy(const MachineInstr &Copy,
@@ -427,7 +596,7 @@ class MachineCopyPropagation : public MachineFunctionPass {
   bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use);
   bool hasOverlappingMultipleDef(const MachineInstr &MI,
                                  const MachineOperand &MODef, Register Def);
-
+  
   /// Candidates for deletion.
   SmallSetVector<MachineInstr *, 8> MaybeDeadCopies;
 
@@ -986,8 +1155,10 @@ static bool isBackwardPropagatableCopy(const DestSourcePair &CopyOperands,
   return CopyOperands.Source->isRenamable() && CopyOperands.Source->isKill();
 }
 
-void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
-  if (!Tracker.hasAnyCopies())
+void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
+                                           ScheduleDAGMCP &DG,
+                                           bool MoveDependenciesForBetterCopyPropagation) {
+  if (!Tracker.hasAnyCopies() && !Tracker.hasAnyInvalidCopies())
     return;
 
   for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx != OpEnd;
@@ -1010,8 +1181,30 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
 
     MachineInstr *Copy = Tracker.findAvailBackwardCopy(
         MI, MODef.getReg().asMCReg(), *TRI, *TII, UseCopyInstr);
-    if (!Copy)
-      continue;
+    if (!Copy) {
+      if (!MoveDependenciesForBetterCopyPropagation)
+        continue;
+
+      LLVM_DEBUG(
+          dbgs()
+          << "MCP: Couldn't find any backward copy that has no dependency.\n");
+      Copy = Tracker.findAvailBackwardCopy(MI, MODef.getReg().asMCReg(), *TRI,
+                                           *TII, UseCopyInstr, true);
+      if (!Copy) {
+        LLVM_DEBUG(
+            dbgs()
+            << "MCP: Couldn't find any backward copy that has dependency.\n");
+        continue;
+      }
+      LLVM_DEBUG(
+          dbgs()
+          << "MCP: Found potential backward copy that has dependency.\n");
+      SUnit *DstSUnit = DG.getSUnit(Copy);
+      SUnit *SrcSUnit = DG.getSUnit(&MI);
+
+      if (!moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit, DG))
+        continue;
+    }
 
     std::optional<DestSourcePair> CopyOperands =
         isCopyInstr(*Copy, *TII, UseCopyInstr);
@@ -1033,23 +1226,35 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
     LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI)
                       << "\n     with " << printReg(Def, TRI) << "\n     in "
                       << MI << "     from " << *Copy);
+    if (!MoveDependenciesForBetterCopyPropagation) {
+      MODef.setReg(Def);
+      MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
 
-    MODef.setReg(Def);
-    MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
-
-    LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
-    MaybeDeadCopies.insert(Copy);
-    Changed = true;
-    ++NumCopyBackwardPropagated;
+      LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
+      MaybeDeadCopies.insert(Copy);
+      Changed = true;
+      ++NumCopyBackwardPropagated;
+    }
   }
 }
 
 void MachineCopyPropagation::BackwardCopyPropagateBlock(
-    MachineBasicBlock &MBB) {
+    MachineBasicBlock &MBB, bool MoveDependenciesForBetterCopyPropagation) {
+  ScheduleDAGMCP DG{*(MBB.getParent()), nullptr, false};
+  if (MoveDependenciesForBetterCopyPropagation) {
+    DG.startBlock(&MBB);
+    DG.enterRegion(&MBB, MBB.begin(), MBB.end(), MBB.size());
+    DG.buildSchedGraph(nullptr);
+    // DG.viewGraph();
+  }
+ 
+
   LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName()
                     << "\n");
 
   for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
+    //llvm::errs() << "Next MI: ";
+    //MI.dump();
     // Ignore non-trivial COPYs.
     std::optional<DestSourcePair> CopyOperands =
         isCopyInstr(MI, *TII, UseCopyInstr);
@@ -1062,7 +1267,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
         // just let forward cp do COPY-to-COPY propagation.
         if (isBackwardPropagatableCopy(*CopyOperands, *MRI)) {
           Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII,
-                                     UseCopyInstr);
+                                     UseCopyInstr, MoveDependenciesForBetterCopyPropagation);
           Tracker.invalidateRegister(DefReg.asMCReg(), *TRI, *TII,
                                      UseCopyInstr);
           Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
@@ -1077,10 +1282,10 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
         MCRegister Reg = MO.getReg().asMCReg();
         if (!Reg)
           continue;
-        Tracker.invalidateRegister(Reg, *TRI, *TII, UseCopyInstr);
+        Tracker.invalidateRegister(Reg, *TRI, *TII, UseCopyInstr, false);
       }
 
-    propagateDefs(MI);
+    propagateDefs(MI, DG, MoveDependenciesForBetterCopyPropagation);
     for (const MachineOperand &MO : MI.operands()) {
       if (!MO.isReg())
         continue;
@@ -1104,7 +1309,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
           }
         } else {
           Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII,
-                                     UseCopyInstr);
+                                     UseCopyInstr, MoveDependenciesForBetterCopyPropagation);
         }
       }
     }
@@ -1122,6 +1327,15 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
     Copy->eraseFromParent();
     ++NumDeletes;
   }
+  if (MoveDependenciesForBetterCopyPropagation) {
+    DG.exitRegion();
+    DG.finishBlock();
+    // QUESTION: Does it makes sense to keep the kill flags here?
+    //           On the other parts of this pass we juts throw out
+    //           the kill flags.
+    DG.fixupKills(MBB);
+  }
+
 
   MaybeDeadCopies.clear();
   CopyDbgUsers.clear();
@@ -1472,11 +1686,29 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
   TRI = MF.getSubtarget().getRegisterInfo();
   TII = MF.getSubtarget().getInstrInfo();
   MRI = &MF.getRegInfo();
+  auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
+  LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
 
   for (MachineBasicBlock &MBB : MF) {
     if (isSpillageCopyElimEnabled)
       EliminateSpillageCopies(MBB);
+
+    // BackwardCopyPropagateBlock happens in two stages.
+    // First we move those unnecessary dependencies out of the way
+    // that may block copy propagations.
+    //
+    // The reason for this two stage approach is that the ScheduleDAG can not
+    // handle register renaming.
+    // QUESTION: I think these two stages could be merged together, if I were to change
+    // the renaming mechanism.
+    //
+    // The renaming wouldn't happen instantly. There would be a data structure
+    // that contained what register should be renamed to what. Then after the
+    // backward propagation has concluded the renaming would happen.
+    BackwardCopyPropagateBlock(MBB, true);
+    // Then we do the actual copy propagation.
     BackwardCopyPropagateBlock(MBB);
+
     ForwardCopyPropagateBlock(MBB);
   }
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index de3f323891a36a..92575d701f4281 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -6026,8 +6026,8 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
 ; CHECK-OUTLINE-O1-NEXT:    mov w0, w19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
@@ -6133,8 +6133,8 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
 ; CHECK-OUTLINE-O1-NEXT:    mov w0, w19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
@@ -6238,8 +6238,8 @@ define { i32, i1 } @cmpxchg_i32(ptr %ptr, i32 %desired, i32 %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
 ; CHECK-OUTLINE-O1-NEXT:    mov w0, w19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
@@ -6336,8 +6336,8 @@ define { i64, i1 } @cmpxchg_i64(ptr %ptr, i64 %desired, i64 %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov x19, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
 ; CHECK-OUTLINE-O1-NEXT:    mov x0, x19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
@@ -6434,8 +6434,8 @@ define { ptr, i1 } @cmpxchg_ptr(ptr %ptr, ptr %desired, ptr %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov x19, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
 ; CHECK-OUTLINE-O1-NEXT:    mov x0, x19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
index c6819ff39ed33e..a1377d6f89d67e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
@@ -33,8 +33,8 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) {
   ; CHECK-NEXT: bb.4.cmpxchg.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %pair = cmpxchg ptr %p, i32 %cmp, i32 %new acquire acquire, !pcsections !0
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -73,8 +73,8 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) {
   ; CHECK-NEXT: bb.4.cmpxchg.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %new = load i32, ptr %pnew, !pcsections !0
   %pair = cmpxchg ptr %p, i32 %cmp, i32 %new acquire acquire, !pcsections !0
   %val = extractvalue { i32, i1 } %pair, 0
@@ -112,8 +112,8 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) {
   ; CHECK-NEXT: bb.4.cmpxchg.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %pair = cmpxchg ptr %p, i32 %cmp, i32 %new acq_rel monotonic, !pcsections !0
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -151,7 +151,7 @@ define i64 @val_compare_and_swap_64(ptr %p, i64 %cmp, i64 %new) {
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $x0 = ORRXrs $xzr, killed $x8, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $x0
   %pair = cmpxchg ptr %p, i64 %cmp, i64 %new monotonic monotonic, !pcsections !0
   %val = extractvalue { i64, i1 } %pair, 0
   ret i64 %val
@@ -189,7 +189,7 @@ define i64 @val_compare_and_swap_64_monotonic_seqcst(ptr %p, i64 %cmp, i64 %new)
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $x0 = ORRXrs $xzr, killed $x8, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $x0
   %pair = cmpxchg ptr %p, i64 %cmp, i64 %new monotonic seq_cst, !pcsections !0
   %val = extractvalue { i64, i1 } %pair, 0
   ret i64 %val
@@ -227,7 +227,7 @@ define i64 @val_compare_and_swap_64_release_acquire(ptr %p, i64 %cmp, i64 %new)
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $x0 = ORRXrs $xzr, killed $x8, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $x0
   %pair = cmpxchg ptr %p, i64 %cmp, i64 %new release acquire, !pcsections !0
   %val = extractvalue { i64, i1 } %pair, 0
   ret i64 %val
@@ -252,8 +252,8 @@ define i32 @fetch_and_nand(ptr %p) {
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %val = atomicrmw nand ptr %p, i32 7 release, !pcsections !0
   ret i32 %val
 }
@@ -278,7 +278,7 @@ define i64 @fetch_and_nand_64(ptr %p) {
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $x0 = ORRXrs $xzr, killed $x8, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $x0
   %val = atomicrmw nand ptr %p, i64 7 acq_rel, !pcsections !0
   ret i64 %val
 }
@@ -303,8 +303,8 @@ define i32 @fetch_and_or(ptr %p) {
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %val = atomicrmw or ptr %p, i32 5 seq_cst, !pcsections !0
   ret i32 %val
 }
@@ -328,7 +328,7 @@ define i64 @fetch_and_or_64(ptr %p) {
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $x0 = ORRXrs $xzr, killed $x8, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $x0
   %val = atomicrmw or ptr %p, i64 7 monotonic, !pcsections !0
   ret i64 %val
 }
@@ -366,7 +366,7 @@ define i32 @atomic_load(ptr %p) {
   ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w0 = LDARW killed renamable $x0, pcsections !0 :: (load seq_cst (s32) from %ir.p)
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
    %r = load atomic i32, ptr %p seq_cst, align 4, !pcsections !0
    ret i32 %r
 }
@@ -384,7 +384,7 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) {
   ; CHECK-NEXT:   renamable $w8 = ADDWrx killed renamable $w8, killed renamable $w10, 0, pcsections !0
   ; CHECK-NEXT:   renamable $w9 = LDRBBui killed renamable $x9, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_random)
   ; CHECK-NEXT:   renamable $w0 = ADDWrx killed renamable $w8, killed renamable $w9, 0, pcsections !0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %ptr_unsigned = getelementptr i8, ptr %p, i32 4095
   %val_unsigned = load atomic i8, ptr %ptr_unsigned monotonic, align 1, !pcsections !0
 
@@ -416,7 +416,7 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) {
   ; CHECK-NEXT:   renamable $w8 = ADDWrx killed renamable $w8, killed renamable $w10, 8, pcsections !0
   ; CHECK-NEXT:   renamable $w9 = LDRHHui killed renamable $x9, 0, pcsections !0 :: (load unordered (s16) from %ir.ptr_random)
   ; CHECK-NEXT:   renamable $w0 = ADDWrx killed renamable $w8, killed renamable $w9, 8, pcsections !0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %ptr_unsigned = getelementptr i16, ptr %p, i32 4095
   %val_unsigned = load atomic i16, ptr %ptr_unsigned monotonic, align 2, !pcsections !0
 
@@ -448,7 +448,7 @@ define i32 @atomic_load_relaxed_32(ptr %p, i32 %off32) {
   ; CHECK-NEXT:   renamable $w9 = LDRWui killed renamable $x11, 0, pcsections !0 :: (load unordered (s32) from %ir.ptr_random)
   ; CHECK-NEXT:   $w8 = ADDWrs killed renamable $w8, killed renamable $w10, 0, pcsections !0
   ; CHECK-NEXT:   $w0 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %ptr_unsigned = getelementptr i32, ptr %p, i32 4095
   %val_unsigned = load atomic i32, ptr %ptr_unsigned monotonic, align 4, !pcsections !0
 
@@ -480,7 +480,7 @@ define i64 @atomic_load_relaxed_64(ptr %p, i32 %off32) {
   ; CHECK-NEXT:   renamable $x9 = LDRXui killed renamable $x11, 0, pcsections !0 :: (load unordered (s64) from %ir.ptr_random)
   ; CHECK-NEXT:   $x8 = ADDXrs killed renamable $x8, killed renamable $x10, 0, pcsections !0
   ; CHECK-NEXT:   $x0 = ADDXrs killed renamable $x8, killed renamable $x9, 0, pcsections !0
-  ; CHECK-NEXT:   RET undef $lr, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $x0
   %ptr_unsigned = getelementptr i64, ptr %p, i32 4095
   %val_unsigned = load atomic i64, ptr %ptr_unsigned monotonic, align 8, !pcsections !0
 
@@ -624,7 +624,7 @@ define i32 @load_zext(ptr %p8, ptr %p16) {
   ; CHECK-NOLSE-NEXT:   renamable $w8 = LDARB killed renamable $x0, pcsections !0 :: (load acquire (s8) from %ir.p8)
   ; CHECK-NOLSE-NEXT:   renamable $w9 = LDRHHui killed renamable $x1, 0, pcsections !0 :: (load unordered (s16) from %ir.p16)
   ; CHECK-NOLSE-NEXT:   renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 0, pcsections !0
-  ; CHECK-NOLSE-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NOLSE-NEXT:   RET undef $lr, implicit killed $w0
   ;
   ; CHECK-LDAPR-LABEL: name: load_zext
   ; CHECK-LDAPR: bb.0 (%ir-block.0):
@@ -633,7 +633,7 @@ define i32 @load_zext(ptr %p8, ptr %p16) {
   ; CHECK-LDAPR-NEXT:   renamable $w8 = LDAPRB killed renamable $x0, pcsections !0 :: (load acquire (s8) from %ir.p8)
   ; CHECK-LDAPR-NEXT:   renamable $w9 = LDRHHui killed renamable $x1, 0, pcsections !0 :: (load unordered (s16) from %ir.p16)
   ; CHECK-LDAPR-NEXT:   renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 0, pcsections !0
-  ; CHECK-LDAPR-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-LDAPR-NEXT:   RET undef $lr, implicit killed $w0
   %val1.8 = load atomic i8, ptr %p8 acquire, align 1, !pcsections !0
   %val1 = zext i8 %val1.8 to i32
 
@@ -651,7 +651,7 @@ define { i32, i64 } @load_acq(ptr %p32, ptr %p64) {
   ; CHECK-NOLSE-NEXT: {{  $}}
   ; CHECK-NOLSE-NEXT:   renamable $w0 = LDARW killed renamable $x0, pcsections !0 :: (load seq_cst (s32) from %ir.p32)
   ; CHECK-NOLSE-NEXT:   renamable $x1 = LDARX killed renamable $x1, pcsections !0 :: (load acquire (s64) from %ir.p64)
-  ; CHECK-NOLSE-NEXT:   RET undef $lr, implicit $w0, implicit $x1
+  ; CHECK-NOLSE-NEXT:   RET undef $lr, implicit killed $w0, implicit killed $x1
   ;
   ; CHECK-LDAPR-LABEL: name: load_acq
   ; CHECK-LDAPR: bb.0 (%ir-block.0):
@@ -659,7 +659,7 @@ define { i32, i64 } @load_acq(ptr %p32, ptr %p64) {
   ; CHECK-LDAPR-NEXT: {{  $}}
   ; CHECK-LDAPR-NEXT:   renamable $w0 = LDARW killed renamable $x0, pcsections !0 :: (load seq_cst (s32) from %ir.p32)
   ; CHECK-LDAPR-NEXT:   renamable $x1 = LDAPRX killed renamable $x1, pcsections !0 :: (load acquire (s64) from %ir.p64)
-  ; CHECK-LDAPR-NEXT:   RET undef $lr, implicit $w0, implicit $x1
+  ; CHECK-LDAPR-NEXT:   RET undef $lr, implicit killed $w0, implicit killed $x1
   %val32 = load atomic i32, ptr %p32 seq_cst, align 4, !pcsections !0
   %tmp = insertvalue { i32, i64 } undef, i32 %val32, 0
 
@@ -678,7 +678,7 @@ define i32 @load_sext(ptr %p8, ptr %p16) {
   ; CHECK-NOLSE-NEXT:   renamable $w9 = LDRHHui killed renamable $x1, 0, pcsections !0 :: (load unordered (s16) from %ir.p16)
   ; CHECK-NOLSE-NEXT:   renamable $w9 = SBFMWri killed renamable $w9, 0, 15
   ; CHECK-NOLSE-NEXT:   renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 32, pcsections !0
-  ; CHECK-NOLSE-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NOLSE-NEXT:   RET undef $lr, implicit killed $w0
   ;
   ; CHECK-LDAPR-LABEL: name: load_sext
   ; CHECK-LDAPR: bb.0 (%ir-block.0):
@@ -688,7 +688,7 @@ define i32 @load_sext(ptr %p8, ptr %p16) {
   ; CHECK-LDAPR-NEXT:   renamable $w9 = LDRHHui killed renamable $x1, 0, pcsections !0 :: (load unordered (s16) from %ir.p16)
   ; CHECK-LDAPR-NEXT:   renamable $w9 = SBFMWri killed renamable $w9, 0, 15
   ; CHECK-LDAPR-NEXT:   renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 32, pcsections !0
-  ; CHECK-LDAPR-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-LDAPR-NEXT:   RET undef $lr, implicit killed $w0
   %val1.8 = load atomic i8, ptr %p8 acquire, align 1, !pcsections !0
   %val1 = sext i8 %val1.8 to i32
 
@@ -728,14 +728,14 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw add ptr %ptr, i8 %rhs seq_cst, !pcsections !0
   ret i8 %res
 }
@@ -746,7 +746,7 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $w1, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $w1 = KILL $w1, implicit-def $x1
+  ; CHECK-NEXT:   renamable $w1 = KILL killed $w1, implicit-def $x1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.atomicrmw.start:
   ; CHECK-NEXT:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
@@ -759,8 +759,8 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw xchg ptr %ptr, i8 %rhs monotonic, !pcsections !0
   ret i8 %res
 }
@@ -777,14 +777,14 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw sub ptr %ptr, i8 %rhs acquire, !pcsections !0
   ret i8 %res
 }
@@ -801,14 +801,14 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw and ptr %ptr, i8 %rhs release, !pcsections !0
   ret i8 %res
 }
@@ -825,14 +825,14 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw or ptr %ptr, i8 %rhs seq_cst, !pcsections !0
   ret i8 %res
 }
@@ -849,14 +849,14 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw xor ptr %ptr, i8 %rhs monotonic, !pcsections !0
   ret i8 %res
 }
@@ -875,14 +875,14 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0
   ; CHECK-NEXT:   dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0
   ; CHECK-NEXT:   renamable $w9 = CSELWr renamable $w8, renamable $w1, 11, implicit killed $nzcv, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw min ptr %ptr, i8 %rhs acquire, !pcsections !0
   ret i8 %res
 }
@@ -901,14 +901,14 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0
   ; CHECK-NEXT:   dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0
   ; CHECK-NEXT:   renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw max ptr %ptr, i8 %rhs release, !pcsections !0
   ret i8 %res
 }
@@ -926,17 +926,17 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8
+  ; CHECK-NEXT:   renamable $w8 = ANDWri killed renamable $w8, 7, implicit $x8
   ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
   ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w11 = STLXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w11 = STLXRB killed renamable $w10, renamable $x0, implicit $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw umin ptr %ptr, i8 %rhs seq_cst, !pcsections !0
   ret i8 %res
 }
@@ -954,17 +954,17 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8
+  ; CHECK-NEXT:   renamable $w8 = ANDWri killed renamable $w8, 7, implicit $x8
   ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
   ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w11 = STXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w11 = STXRB killed renamable $w10, renamable $x0, implicit $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw umax ptr %ptr, i8 %rhs monotonic, !pcsections !0
   ret i8 %res
 }
@@ -981,14 +981,14 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw add ptr %ptr, i16 %rhs seq_cst, !pcsections !0
   ret i16 %res
 }
@@ -999,7 +999,7 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $w1, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $w1 = KILL $w1, implicit-def $x1
+  ; CHECK-NEXT:   renamable $w1 = KILL killed $w1, implicit-def $x1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.atomicrmw.start:
   ; CHECK-NEXT:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
@@ -1012,8 +1012,8 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw xchg ptr %ptr, i16 %rhs monotonic, !pcsections !0
   ret i16 %res
 }
@@ -1030,14 +1030,14 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw sub ptr %ptr, i16 %rhs acquire, !pcsections !0
   ret i16 %res
 }
@@ -1054,14 +1054,14 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw and ptr %ptr, i16 %rhs release, !pcsections !0
   ret i16 %res
 }
@@ -1078,14 +1078,14 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw or ptr %ptr, i16 %rhs seq_cst, !pcsections !0
   ret i16 %res
 }
@@ -1102,14 +1102,14 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
   ; CHECK-NEXT:   $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw xor ptr %ptr, i16 %rhs monotonic, !pcsections !0
   ret i16 %res
 }
@@ -1128,14 +1128,14 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0
   ; CHECK-NEXT:   dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0
   ; CHECK-NEXT:   renamable $w9 = CSELWr renamable $w8, renamable $w1, 11, implicit killed $nzcv, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw min ptr %ptr, i16 %rhs acquire, !pcsections !0
   ret i16 %res
 }
@@ -1154,14 +1154,14 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0
   ; CHECK-NEXT:   dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0
   ; CHECK-NEXT:   renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def $x9, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w10, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw max ptr %ptr, i16 %rhs release, !pcsections !0
   ret i16 %res
 }
@@ -1179,17 +1179,17 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8
+  ; CHECK-NEXT:   renamable $w8 = ANDWri killed renamable $w8, 15, implicit $x8
   ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
   ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w11 = STLXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w11 = STLXRH killed renamable $w10, renamable $x0, implicit $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw umin ptr %ptr, i16 %rhs seq_cst, !pcsections !0
   ret i16 %res
 }
@@ -1207,17 +1207,17 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8
+  ; CHECK-NEXT:   renamable $w8 = ANDWri killed renamable $w8, 15, implicit $x8
   ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
   ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
-  ; CHECK-NEXT:   early-clobber renamable $w11 = STXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w11 = STXRH killed renamable $w10, renamable $x0, implicit $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
   ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
   %res = atomicrmw umax ptr %ptr, i16 %rhs monotonic, !pcsections !0
   ret i16 %res
 }
@@ -1228,8 +1228,8 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $w1, $w2, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $x8 = ORRXrs $xzr, $x0, 0
-  ; CHECK-NEXT:   renamable $w2 = KILL $w2, implicit-def $x2
+  ; CHECK-NEXT:   $x8 = ORRXrs $xzr, killed $x0, 0
+  ; CHECK-NEXT:   renamable $w2 = KILL killed $w2, implicit-def $x2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.cmpxchg.start:
   ; CHECK-NEXT:   successors: %bb.2(0x7c000000), %bb.4(0x04000000)
@@ -1251,16 +1251,16 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
   ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w1 = MOVZWi 1, 0
-  ; CHECK-NEXT:   $w0 = KILL renamable $w0, implicit killed $x0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0, implicit $w1
+  ; CHECK-NEXT:   $w0 = KILL killed renamable $w0, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0, implicit killed $w1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4.cmpxchg.nostore:
   ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $w1 = ORRWrs $wzr, $wzr, 0
   ; CHECK-NEXT:   CLREX 15, pcsections !0
-  ; CHECK-NEXT:   $w0 = KILL renamable $w0, implicit killed $x0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0, implicit $w1
+  ; CHECK-NEXT:   $w0 = KILL killed renamable $w0, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0, implicit killed $w1
   %res = cmpxchg ptr %ptr, i8 %desired, i8 %new monotonic monotonic, !pcsections !0
   ret { i8, i1 } %res
 }
@@ -1271,8 +1271,8 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $w1, $w2, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $x8 = ORRXrs $xzr, $x0, 0
-  ; CHECK-NEXT:   renamable $w2 = KILL $w2, implicit-def $x2
+  ; CHECK-NEXT:   $x8 = ORRXrs $xzr, killed $x0, 0
+  ; CHECK-NEXT:   renamable $w2 = KILL killed $w2, implicit-def $x2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.cmpxchg.start:
   ; CHECK-NEXT:   successors: %bb.2(0x7c000000), %bb.4(0x04000000)
@@ -1294,16 +1294,16 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
   ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w1 = MOVZWi 1, 0
-  ; CHECK-NEXT:   $w0 = KILL renamable $w0, implicit killed $x0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0, implicit $w1
+  ; CHECK-NEXT:   $w0 = KILL killed renamable $w0, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0, implicit killed $w1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4.cmpxchg.nostore:
   ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $w1 = ORRWrs $wzr, $wzr, 0
   ; CHECK-NEXT:   CLREX 15, pcsections !0
-  ; CHECK-NEXT:   $w0 = KILL renamable $w0, implicit killed $x0
-  ; CHECK-NEXT:   RET undef $lr, implicit $w0, implicit $w1
+  ; CHECK-NEXT:   $w0 = KILL killed renamable $w0, implicit $x0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0, implicit killed $w1
   %res = cmpxchg ptr %ptr, i16 %desired, i16 %new monotonic monotonic, !pcsections !0
   ret { i16, i1 } %res
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
index 7fd71b26fa1ba7..c8f8361e5ef885 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
@@ -256,9 +256,8 @@ define dso_local i32 @load_between_stores(i32 %x, ptr %p, ptr %ptr) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    strh w0, [x1]
 ; CHECK-NEXT:    lsr w9, w0, #16
-; CHECK-NEXT:    ldr w8, [x2]
+; CHECK-NEXT:    ldr w0, [x2]
 ; CHECK-NEXT:    strh w9, [x1, #2]
-; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %t1 = trunc i32 %x to i16
   %sh = lshr i32 %x, 16
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
index e11ae9a2515900..63cfa19ab3d492 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
@@ -510,10 +510,9 @@ define i128 @mulv_v2i128(<2 x i128> %a) {
 ; CHECK-GI-LABEL: mulv_v2i128:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mul x9, x0, x3
-; CHECK-GI-NEXT:    mul x8, x0, x2
 ; CHECK-GI-NEXT:    umulh x10, x0, x2
 ; CHECK-GI-NEXT:    madd x9, x1, x2, x9
-; CHECK-GI-NEXT:    mov x0, x8
+; CHECK-GI-NEXT:    mul x0, x0, x2
 ; CHECK-GI-NEXT:    add x1, x9, x10
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
index 410c2d9021d6d5..a150a0f6ee40a2 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
@@ -131,13 +131,12 @@ entry:
 define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) {
 ; CHECK-SD-LABEL: mla_i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umull2 v7.8h, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    umull v6.8h, v0.8b, v1.8b
-; CHECK-SD-NEXT:    uaddw2 v5.4s, v5.4s, v7.8h
+; CHECK-SD-NEXT:    umull2 v7.8h, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    uaddw v0.4s, v2.4s, v6.4h
 ; CHECK-SD-NEXT:    uaddw2 v1.4s, v3.4s, v6.8h
+; CHECK-SD-NEXT:    uaddw2 v3.4s, v5.4s, v7.8h
 ; CHECK-SD-NEXT:    uaddw v2.4s, v4.4s, v7.4h
-; CHECK-SD-NEXT:    mov v3.16b, v5.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: mla_i32:
@@ -170,18 +169,17 @@ define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
 ; CHECK-SD-NEXT:    umull2 v0.8h, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    ldp q20, q21, [sp]
 ; CHECK-SD-NEXT:    ushll v17.4s, v16.4h, #0
+; CHECK-SD-NEXT:    ushll v18.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll2 v16.4s, v16.8h, #0
 ; CHECK-SD-NEXT:    ushll2 v19.4s, v0.8h, #0
-; CHECK-SD-NEXT:    ushll v18.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    uaddw2 v1.2d, v3.2d, v17.4s
 ; CHECK-SD-NEXT:    uaddw v0.2d, v2.2d, v17.2s
 ; CHECK-SD-NEXT:    uaddw2 v3.2d, v5.2d, v16.4s
 ; CHECK-SD-NEXT:    uaddw v2.2d, v4.2d, v16.2s
-; CHECK-SD-NEXT:    uaddw2 v16.2d, v21.2d, v19.4s
 ; CHECK-SD-NEXT:    uaddw v4.2d, v6.2d, v18.2s
 ; CHECK-SD-NEXT:    uaddw2 v5.2d, v7.2d, v18.4s
+; CHECK-SD-NEXT:    uaddw2 v7.2d, v21.2d, v19.4s
 ; CHECK-SD-NEXT:    uaddw v6.2d, v20.2d, v19.2s
-; CHECK-SD-NEXT:    mov v7.16b, v16.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: mla_i64:
diff --git a/llvm/test/CodeGen/AArch64/addp-shuffle.ll b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
index fb96d11acc275a..8a9cca866ff05d 100644
--- a/llvm/test/CodeGen/AArch64/addp-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
@@ -63,9 +63,8 @@ define <16 x i8> @deinterleave_shuffle_v32i8(<32 x i8> %a) {
 define <4 x i64> @deinterleave_shuffle_v8i64(<8 x i64> %a) {
 ; CHECK-LABEL: deinterleave_shuffle_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addp v2.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    addp v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEXT:    addp v1.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    ret
   %r0 = shufflevector <8 x i64> %a, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %r1 = shufflevector <8 x i64> %a, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -123,9 +122,8 @@ define <8 x half> @deinterleave_shuffle_v16f16(<16 x half> %a) {
 define <4 x double> @deinterleave_shuffle_v8f64(<8 x double> %a) {
 ; CHECK-LABEL: deinterleave_shuffle_v8f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    faddp v2.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    faddp v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEXT:    faddp v1.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    ret
   %r0 = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %r1 = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
new file mode 100644
index 00000000000000..c3a59990ccd255
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
@@ -0,0 +1,201 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -run-pass machine-cp  -verify-machineinstrs  -o - %s | FileCheck %s
+--- |
+  source_filename = "llvmirrepoW.ll"
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+
+  declare dso_local i32 @chain(i32 noundef, i32 noundef) local_unnamed_addr
+
+  declare dso_local void @init_var(ptr noundef) local_unnamed_addr
+
+  define dso_local void @fun2(i64 %a, i64 %b) local_unnamed_addr {
+  entry:
+    %c = alloca i32, align 4
+    ret void
+  }
+  define dso_local void @blocker(i64 %a, i64 %b) local_unnamed_addr {
+  entry:
+    %c = alloca i32, align 4
+    ret void
+  }
+
+...
+---
+name:            fun2
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: true
+registers:       []
+liveins:
+  - { reg: '$x0', virtual-reg: '' }
+  - { reg: '$x1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  4
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: c, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: fun2
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: $w0 = KILL killed renamable $w0, implicit $x0
+    ; CHECK-NEXT: $w1 = KILL killed renamable $w1, implicit $x1
+    ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def $w0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: renamable $w1 = COPY killed $w0
+    ; CHECK-NEXT: $w0 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def dead $w0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: RET_ReallyLR
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $w0 = KILL renamable $w0, implicit killed $x0
+    $w1 = KILL renamable $w1, implicit killed $x1
+    BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
+    renamable $w1 = COPY $w0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $w0 = COPY killed renamable $w8
+    BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    RET_ReallyLR
+
+...
+
+---
+name:            blocker
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: true
+registers:       []
+liveins:
+  - { reg: '$x0', virtual-reg: '' }
+  - { reg: '$x1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  4
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: c, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: blocker
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: $w0 = KILL killed renamable $w0, implicit $x0
+    ; CHECK-NEXT: $w1 = KILL killed renamable $w1, implicit $x1
+    ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def $w0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
+    ; CHECK-NEXT: renamable $w1 = COPY $w0
+    ; CHECK-NEXT: $w0 = ADDWrr killed $w0, $w0
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def dead $w0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: RET_ReallyLR
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $w0 = KILL renamable $w0, implicit killed $x0
+    $w1 = KILL renamable $w1, implicit killed $x1
+    BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
+    renamable $w8 = ADDWrr $w0, $w0
+    renamable $w1 = COPY $w0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $w0 = COPY killed renamable $w8
+    BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
index c2ec0502d83bd7..cd821675bae6e1 100644
--- a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -28,9 +28,8 @@ define i56 @ldi56(ptr %p) nounwind {
 define i80 @ldi80(ptr %p) nounwind {
 ; CHECK-LABEL: ldi80:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ldrh w1, [x0, #8]
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ldr x0, [x0]
 ; CHECK-NEXT:    ret
     %r = load i80, ptr %p
     ret i80 %r
@@ -55,8 +54,9 @@ define i280 @ldi280(ptr %p) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w9, [x0, #34]
 ; CHECK-NEXT:    ldrh w10, [x0, #32]
-; CHECK-NEXT:    ldp x8, x1, [x0]
-; CHECK-NEXT:    ldp x2, x3, [x0, #16]
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    ldp x1, x2, [x0, #8]
+; CHECK-NEXT:    ldr x3, [x0, #24]
 ; CHECK-NEXT:    orr x4, x10, x9, lsl #16
 ; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index 1f5654d59926dc..4f067ab1d48c1f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -31,35 +31,21 @@ define <8 x i16> @func2(<8 x i8> %v0) nounwind {
 }
 
 define <16 x i16> @func3(<16 x i8> %v0) nounwind {
-; CHECK-SD-LABEL: func3:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ushll2.8h v1, v0, #0
-; CHECK-SD-NEXT:    ushll.8h v0, v0, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: func3:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll.8h v2, v0, #0
-; CHECK-GI-NEXT:    ushll2.8h v1, v0, #0
-; CHECK-GI-NEXT:    mov.16b v0, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: func3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll2.8h v1, v0, #0
+; CHECK-NEXT:    ushll.8h v0, v0, #0
+; CHECK-NEXT:    ret
   %r = zext <16 x i8> %v0 to <16 x i16>
   ret <16 x i16> %r
 }
 
 define <16 x i16> @func4(<16 x i8> %v0) nounwind {
-; CHECK-SD-LABEL: func4:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    sshll2.8h v1, v0, #0
-; CHECK-SD-NEXT:    sshll.8h v0, v0, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: func4:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sshll.8h v2, v0, #0
-; CHECK-GI-NEXT:    sshll2.8h v1, v0, #0
-; CHECK-GI-NEXT:    mov.16b v0, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: func4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll2.8h v1, v0, #0
+; CHECK-NEXT:    sshll.8h v0, v0, #0
+; CHECK-NEXT:    ret
   %r = sext <16 x i8> %v0 to <16 x i16>
   ret <16 x i16> %r
 }
@@ -87,35 +73,21 @@ define <4 x i32> @afunc2(<4 x i16> %v0) nounwind {
 }
 
 define <8 x i32> @afunc3(<8 x i16> %v0) nounwind {
-; CHECK-SD-LABEL: afunc3:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ushll2.4s v1, v0, #0
-; CHECK-SD-NEXT:    ushll.4s v0, v0, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: afunc3:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll.4s v2, v0, #0
-; CHECK-GI-NEXT:    ushll2.4s v1, v0, #0
-; CHECK-GI-NEXT:    mov.16b v0, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: afunc3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll2.4s v1, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ret
   %r = zext <8 x i16> %v0 to <8 x i32>
   ret <8 x i32> %r
 }
 
 define <8 x i32> @afunc4(<8 x i16> %v0) nounwind {
-; CHECK-SD-LABEL: afunc4:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    sshll2.4s v1, v0, #0
-; CHECK-SD-NEXT:    sshll.4s v0, v0, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: afunc4:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sshll.4s v2, v0, #0
-; CHECK-GI-NEXT:    sshll2.4s v1, v0, #0
-; CHECK-GI-NEXT:    mov.16b v0, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: afunc4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll2.4s v1, v0, #0
+; CHECK-NEXT:    sshll.4s v0, v0, #0
+; CHECK-NEXT:    ret
   %r = sext <8 x i16> %v0 to <8 x i32>
   ret <8 x i32> %r
 }
@@ -161,35 +133,21 @@ define <8 x i32> @bfunc2(<8 x i8> %v0) nounwind {
 ;-----
 
 define <4 x i64> @zfunc1(<4 x i32> %v0) nounwind {
-; CHECK-SD-LABEL: zfunc1:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ushll2.2d v1, v0, #0
-; CHECK-SD-NEXT:    ushll.2d v0, v0, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: zfunc1:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll.2d v2, v0, #0
-; CHECK-GI-NEXT:    ushll2.2d v1, v0, #0
-; CHECK-GI-NEXT:    mov.16b v0, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: zfunc1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll2.2d v1, v0, #0
+; CHECK-NEXT:    ushll.2d v0, v0, #0
+; CHECK-NEXT:    ret
   %r = zext <4 x i32> %v0 to <4 x i64>
   ret <4 x i64> %r
 }
 
 define <4 x i64> @zfunc2(<4 x i32> %v0) nounwind {
-; CHECK-SD-LABEL: zfunc2:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    sshll2.2d v1, v0, #0
-; CHECK-SD-NEXT:    sshll.2d v0, v0, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: zfunc2:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sshll.2d v2, v0, #0
-; CHECK-GI-NEXT:    sshll2.2d v1, v0, #0
-; CHECK-GI-NEXT:    mov.16b v0, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: zfunc2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll2.2d v1, v0, #0
+; CHECK-NEXT:    sshll.2d v0, v0, #0
+; CHECK-NEXT:    ret
   %r = sext <4 x i32> %v0 to <4 x i64>
   ret <4 x i64> %r
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
index bf559da91901cf..e13dde101271ae 100644
--- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
@@ -130,8 +130,7 @@ define dso_local void @copy_notcxx14aggregate(ptr inreg noalias sret(%struct.Not
 define dso_local [2 x i64] @copy_notpod(ptr %x) {
 ; CHECK-LABEL: copy_notpod:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x1, [x0]
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ldp x0, x1, [x0]
 ; CHECK-NEXT:    ret
   %x2 = load [2 x i64], ptr %x
   ret [2 x i64] %x2
diff --git a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
index b940734c6988cb..bc2d715995b901 100644
--- a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
+++ b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 # Check that we can remove the redundant save of constant registers such as $wzr
 # RUN: llc -mtriple=aarch64-unknown-linux %s -verify-machineinstrs -start-before=machine-cp -o - | FileCheck %s --check-prefix ASM
 # RUN: llc -mtriple=aarch64-unknown-linux %s -verify-machineinstrs -run-pass=machine-cp -o - | FileCheck %s
@@ -38,3 +39,5 @@ body: |
     $w0 = COPY killed renamable $w19
     RET_ReallyLR implicit $w0
 ...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index d307107fc07ee6..c8b73625086644 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -40,9 +40,8 @@ define i1 @usubo_ugt_constant_op0_i8(i8 %x, ptr %p) nounwind {
 ; CHECK-NEXT:    mov w9, #42 // =0x2a
 ; CHECK-NEXT:    cmp w8, #42
 ; CHECK-NEXT:    sub w9, w9, w0
-; CHECK-NEXT:    cset w8, hi
+; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    strb w9, [x1]
-; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %s = sub i8 42, %x
   %ov = icmp ugt i8 %x, 42
@@ -59,9 +58,8 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) nounwind {
 ; CHECK-NEXT:    mov w9, #43 // =0x2b
 ; CHECK-NEXT:    cmp w8, #43
 ; CHECK-NEXT:    sub w9, w9, w0
-; CHECK-NEXT:    cset w8, hi
+; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    strh w9, [x1]
-; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %s = sub i16 43, %x
   %ov = icmp ult i16 43, %x
@@ -78,8 +76,7 @@ define i1 @usubo_ult_constant_op1_i16(i16 %x, ptr %p) nounwind {
 ; CHECK-NEXT:    sub w9, w0, #44
 ; CHECK-NEXT:    cmp w8, #44
 ; CHECK-NEXT:    strh w9, [x1]
-; CHECK-NEXT:    cset w8, lo
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
   %s = add i16 %x, -44
   %ov = icmp ult i16 %x, 44
@@ -94,8 +91,7 @@ define i1 @usubo_ugt_constant_op1_i8(i8 %x, ptr %p) nounwind {
 ; CHECK-NEXT:    sub w9, w0, #45
 ; CHECK-NEXT:    cmp w8, #45
 ; CHECK-NEXT:    strb w9, [x1]
-; CHECK-NEXT:    cset w8, lo
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
   %ov = icmp ugt i8 45, %x
   %s = add i8 %x, -45
@@ -110,9 +106,8 @@ define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    sub w9, w0, #1
-; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    str w9, [x1]
-; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %s = add i32 %x, -1
   %ov = icmp eq i32 %x, 0
diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
index 186d191444feb6..4558d7c464fe32 100644
--- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
+++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
@@ -31,10 +31,10 @@ define i32 @test_return(ptr %p, i32 %oldval, i32 %newval) {
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
-; OUTLINE-ATOMICS-NEXT:    mov x8, x0
 ; OUTLINE-ATOMICS-NEXT:    mov w19, w1
-; OUTLINE-ATOMICS-NEXT:    mov w0, w1
+; OUTLINE-ATOMICS-NEXT:    mov x8, x0
 ; OUTLINE-ATOMICS-NEXT:    mov w1, w2
+; OUTLINE-ATOMICS-NEXT:    mov w0, w19
 ; OUTLINE-ATOMICS-NEXT:    mov x2, x8
 ; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas4_acq_rel
 ; OUTLINE-ATOMICS-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
@@ -81,10 +81,10 @@ define i1 @test_return_bool(ptr %value, i8 %oldValue, i8 %newValue) {
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
-; OUTLINE-ATOMICS-NEXT:    mov x8, x0
 ; OUTLINE-ATOMICS-NEXT:    mov w19, w1
-; OUTLINE-ATOMICS-NEXT:    mov w0, w1
+; OUTLINE-ATOMICS-NEXT:    mov x8, x0
 ; OUTLINE-ATOMICS-NEXT:    mov w1, w2
+; OUTLINE-ATOMICS-NEXT:    mov w0, w19
 ; OUTLINE-ATOMICS-NEXT:    mov x2, x8
 ; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas1_acq_rel
 ; OUTLINE-ATOMICS-NEXT:    cmp w0, w19, uxtb
@@ -126,10 +126,10 @@ define void @test_conditional(ptr %p, i32 %oldval, i32 %newval) {
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
-; OUTLINE-ATOMICS-NEXT:    mov x8, x0
 ; OUTLINE-ATOMICS-NEXT:    mov w19, w1
-; OUTLINE-ATOMICS-NEXT:    mov w0, w1
+; OUTLINE-ATOMICS-NEXT:    mov x8, x0
 ; OUTLINE-ATOMICS-NEXT:    mov w1, w2
+; OUTLINE-ATOMICS-NEXT:    mov w0, w19
 ; OUTLINE-ATOMICS-NEXT:    mov x2, x8
 ; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas4_acq_rel
 ; OUTLINE-ATOMICS-NEXT:    cmp w0, w19
diff --git a/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll b/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll
index 420cf93a023d70..88dedb720f3975 100644
--- a/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll
+++ b/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll
@@ -4,9 +4,8 @@
 define <8 x half> @sel.v8f16.fmul(ptr %p, ptr %q, <8 x half> %a, <8 x half> %b, <4 x half> %c) {
 ; CHECK-LABEL: sel.v8f16.fmul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmul v1.8h, v1.8h, v0.h[0]
 ; CHECK-NEXT:    fmul v2.4h, v2.4h, v0.h[0]
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    fmul v0.8h, v1.8h, v0.h[0]
 ; CHECK-NEXT:    str d2, [x0]
 ; CHECK-NEXT:    ret
   %splat = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer
@@ -21,9 +20,8 @@ define <8 x half> @sel.v8f16.fmul(ptr %p, ptr %q, <8 x half> %a, <8 x half> %b,
 define <4 x float> @sel.v4f32.fmul(ptr %p, ptr %q, <4 x float> %a, <4 x float> %b, <2 x float> %c) {
 ; CHECK-LABEL: sel.v4f32.fmul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v0.s[0]
 ; CHECK-NEXT:    fmul v2.2s, v2.2s, v0.s[0]
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    fmul v0.4s, v1.4s, v0.s[0]
 ; CHECK-NEXT:    str d2, [x0]
 ; CHECK-NEXT:    ret
   %splat = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> zeroinitializer
@@ -38,9 +36,8 @@ define <4 x float> @sel.v4f32.fmul(ptr %p, ptr %q, <4 x float> %a, <4 x float> %
 define <8 x i16> @sel.v8i16.mul(ptr %p, ptr %q, <8 x i16> %a, <8 x i16> %b, <4 x i16> %c) {
 ; CHECK-LABEL: sel.v8i16.mul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul v1.8h, v1.8h, v0.h[0]
 ; CHECK-NEXT:    mul v2.4h, v2.4h, v0.h[0]
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mul v0.8h, v1.8h, v0.h[0]
 ; CHECK-NEXT:    str d2, [x0]
 ; CHECK-NEXT:    ret
   %splat = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -55,9 +52,8 @@ define <8 x i16> @sel.v8i16.mul(ptr %p, ptr %q, <8 x i16> %a, <8 x i16> %b, <4 x
 define <4 x i32> @sel.v4i32.mul(ptr %p, ptr %q, <4 x i32> %a, <4 x i32> %b, <2 x i32> %c) {
 ; CHECK-LABEL: sel.v4i32.mul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul v1.4s, v1.4s, v0.s[0]
 ; CHECK-NEXT:    mul v2.2s, v2.2s, v0.s[0]
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mul v0.4s, v1.4s, v0.s[0]
 ; CHECK-NEXT:    str d2, [x0]
 ; CHECK-NEXT:    ret
   %splat = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index a5d7ae147ffda2..cd227a02357660 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1618,23 +1618,23 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ; CHECK-GI-NOFP16-NEXT:    mov v16.s[1], w1
 ; CHECK-GI-NOFP16-NEXT:    mov v18.s[1], w5
 ; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-GI-NOFP16-NEXT:    fmov w9, s5
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w7
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v6.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fmov s5, w7
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ldr s7, [sp, #24]
 ; CHECK-GI-NOFP16-NEXT:    mov v16.s[2], w2
+; CHECK-GI-NOFP16-NEXT:    mov v18.s[2], w6
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w8
 ; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], w9
 ; CHECK-GI-NOFP16-NEXT:    fmov w9, s6
 ; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], v17.s[0]
 ; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #40]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v18.s[2], w6
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v16.s[3], w3
 ; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], w9
@@ -1687,6 +1687,7 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
 ; CHECK-GI-FP16-NEXT:    fmov s1, w8
 ; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT:    fmov s5, w9
 ; CHECK-GI-FP16-NEXT:    mov v2.s[2], w10
 ; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    mov v6.s[2], w2
@@ -1694,20 +1695,19 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ; CHECK-GI-FP16-NEXT:    mov v7.s[2], v16.s[0]
 ; CHECK-GI-FP16-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-FP16-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    fmov s5, w9
+; CHECK-GI-FP16-NEXT:    mov v5.s[1], w9
 ; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v6.s[3], w3
 ; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
 ; CHECK-GI-FP16-NEXT:    fmov w8, s3
 ; CHECK-GI-FP16-NEXT:    fmov s3, w7
-; CHECK-GI-FP16-NEXT:    mov v5.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v5.s[2], w9
 ; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v3.s[1], w8
 ; CHECK-GI-FP16-NEXT:    fmov w8, s4
 ; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #16]
 ; CHECK-GI-FP16-NEXT:    ushl v1.4s, v1.4s, v2.4s
 ; CHECK-GI-FP16-NEXT:    neg v2.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    mov v5.s[2], w9
 ; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
 ; CHECK-GI-FP16-NEXT:    sshl v1.4s, v1.4s, v2.4s
 ; CHECK-GI-FP16-NEXT:    fmov w8, s4
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index 93d3d96d67b650..d4c4b461b82daf 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -671,35 +671,29 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov h13, v0.h[6]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -782,20 +776,17 @@ define <4 x half> @exp_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    mov h10, v0.h[3]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -912,40 +903,33 @@ define <8 x half> @exp_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    mov h14, v0.h[7]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1148,40 +1132,33 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    mov h1, v2.h[7]
 ; CHECK-GI-NEXT:    str h1, [sp, #160] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #192] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h15
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1189,46 +1166,39 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
@@ -1941,35 +1911,29 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov h13, v0.h[6]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -2052,20 +2016,17 @@ define <4 x half> @exp2_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    mov h10, v0.h[3]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -2182,40 +2143,33 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    mov h14, v0.h[7]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -2418,40 +2372,33 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    mov h1, v2.h[7]
 ; CHECK-GI-NEXT:    str h1, [sp, #160] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #192] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h15
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -2459,46 +2406,39 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
@@ -3211,35 +3151,29 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov h13, v0.h[6]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -3322,20 +3256,17 @@ define <4 x half> @log_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    mov h10, v0.h[3]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -3452,40 +3383,33 @@ define <8 x half> @log_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    mov h14, v0.h[7]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -3688,40 +3612,33 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    mov h1, v2.h[7]
 ; CHECK-GI-NEXT:    str h1, [sp, #160] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #192] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h15
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -3729,46 +3646,39 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
@@ -4481,35 +4391,29 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov h13, v0.h[6]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -4592,20 +4496,17 @@ define <4 x half> @log2_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    mov h10, v0.h[3]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -4722,40 +4623,33 @@ define <8 x half> @log2_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    mov h14, v0.h[7]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -4958,40 +4852,33 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    mov h1, v2.h[7]
 ; CHECK-GI-NEXT:    str h1, [sp, #160] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #192] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h15
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -4999,46 +4886,39 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
@@ -5751,35 +5631,29 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov h13, v0.h[6]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -5862,20 +5736,17 @@ define <4 x half> @log10_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    mov h10, v0.h[3]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -5992,40 +5863,33 @@ define <8 x half> @log10_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    mov h14, v0.h[7]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -6228,40 +6092,33 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    mov h1, v2.h[7]
 ; CHECK-GI-NEXT:    str h1, [sp, #160] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #192] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h15
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -6269,46 +6126,39 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index d942839c577d2b..b9cba619bfacc7 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -106,18 +106,15 @@ define <4 x fp128> @fpext_v4f16_v4f128(<4 x half> %a) {
 ; CHECK-SD-NEXT:    bl __extendhftf2
 ; CHECK-SD-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov h1, v1.h[1]
-; CHECK-SD-NEXT:    fmov s0, s1
+; CHECK-SD-NEXT:    mov h0, v1.h[1]
 ; CHECK-SD-NEXT:    bl __extendhftf2
 ; CHECK-SD-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov h1, v1.h[2]
-; CHECK-SD-NEXT:    fmov s0, s1
+; CHECK-SD-NEXT:    mov h0, v1.h[2]
 ; CHECK-SD-NEXT:    bl __extendhftf2
 ; CHECK-SD-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov h1, v1.h[3]
-; CHECK-SD-NEXT:    fmov s0, s1
+; CHECK-SD-NEXT:    mov h0, v1.h[3]
 ; CHECK-SD-NEXT:    bl __extendhftf2
 ; CHECK-SD-NEXT:    mov v3.16b, v0.16b
 ; CHECK-SD-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
@@ -179,8 +176,7 @@ define <4 x fp128> @fpext_v4f32_v4f128(<4 x float> %a) {
 ; CHECK-SD-NEXT:    bl __extendsftf2
 ; CHECK-SD-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    mov s1, v1.s[1]
-; CHECK-SD-NEXT:    fmov s0, s1
+; CHECK-SD-NEXT:    mov s0, v1.s[1]
 ; CHECK-SD-NEXT:    bl __extendsftf2
 ; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -302,18 +298,11 @@ entry:
 }
 
 define <4 x double> @fpext_v4f32_v4f64(<4 x float> %a) {
-; CHECK-SD-LABEL: fpext_v4f32_v4f64:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fcvtl2 v1.2d, v0.4s
-; CHECK-SD-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fpext_v4f32_v4f64:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fcvtl v2.2d, v0.2s
-; CHECK-GI-NEXT:    fcvtl2 v1.2d, v0.4s
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fpext_v4f32_v4f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtl2 v1.2d, v0.4s
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    ret
 entry:
   %c = fpext <4 x float> %a to <4 x double>
   ret <4 x double> %c
@@ -432,18 +421,11 @@ entry:
 }
 
 define <8 x float> @fpext_v8f16_v8f32(<8 x half> %a) {
-; CHECK-SD-LABEL: fpext_v8f16_v8f32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fcvtl2 v1.4s, v0.8h
-; CHECK-SD-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fpext_v8f16_v8f32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fcvtl v2.4s, v0.4h
-; CHECK-GI-NEXT:    fcvtl2 v1.4s, v0.8h
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fpext_v8f16_v8f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtl2 v1.4s, v0.8h
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    ret
 entry:
   %c = fpext <8 x half> %a to <8 x float>
   ret <8 x float> %c
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index 8d40121ad4543f..f2f8ec56e32fc2 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -813,29 +813,25 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str h2, [sp, #174] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl powf
 ; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -956,23 +952,20 @@ define <4 x half> @pow_v4f16(<4 x half> %a, <4 x half> %b) {
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl powf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1120,29 +1113,25 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str h2, [sp, #190] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl powf
 ; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1437,37 +1426,32 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-GI-NEXT:    str h1, [sp, #302] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    fcvt s1, h2
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    str q0, [sp, #304] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    fcvt s2, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    str q0, [sp, #320] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    ldr h1, [sp, #272] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s2, h1
-; CHECK-GI-NEXT:    fcvt s1, h10
+; CHECK-GI-NEXT:    ldr h1, [sp, #272] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #272] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h1
+; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    ldr h1, [sp, #240] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s2, h1
-; CHECK-GI-NEXT:    fcvt s1, h11
+; CHECK-GI-NEXT:    ldr h1, [sp, #240] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #240] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h1
+; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    bl powf
 ; CHECK-GI-NEXT:    ldr h1, [sp, #176] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index 5dbcaa4a5fda17..dd59620eb020c0 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -736,41 +736,35 @@ define <7 x half> @powi_v7f16(<7 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    mov h13, v0.h[6]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl __powisf2
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -860,23 +854,20 @@ define <4 x half> @powi_v4f16(<4 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    mov h10, v0.h[3]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl __powisf2
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1004,47 +995,40 @@ define <8 x half> @powi_v8f16(<8 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    mov h14, v0.h[7]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl __powisf2
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1267,47 +1251,40 @@ define <16 x half> @powi_v16f16(<16 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    mov h1, v2.h[7]
 ; CHECK-GI-NEXT:    str h1, [sp, #176] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #192] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h15
 ; CHECK-GI-NEXT:    bl __powisf2
 ; CHECK-GI-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1316,53 +1293,46 @@ define <16 x half> @powi_v16f16(<16 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    ldr h1, [sp, #80] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr h1, [sp, #80] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    ldr h1, [sp, #112] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr h1, [sp, #112] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    ldr h1, [sp, #176] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr h1, [sp, #176] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str q0, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl __powisf2
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q2, [sp, #144] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index 1a10fd2f1cdc3d..3d87a9193f21cb 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -800,29 +800,25 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str h2, [sp, #174] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl fmodf
 ; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -943,23 +939,20 @@ define <4 x half> @frem_v4f16(<4 x half> %a, <4 x half> %b) {
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl fmodf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1107,29 +1100,25 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-GI-NEXT:    fcvt s1, h1
 ; CHECK-GI-NEXT:    str h2, [sp, #190] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl fmodf
 ; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1424,37 +1413,32 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-GI-NEXT:    str h1, [sp, #302] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    fcvt s1, h2
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    str q0, [sp, #304] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    fcvt s2, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    str q0, [sp, #320] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    ldr h1, [sp, #272] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s2, h1
-; CHECK-GI-NEXT:    fcvt s1, h10
+; CHECK-GI-NEXT:    ldr h1, [sp, #272] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #272] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h1
+; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    ldr h1, [sp, #240] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s2, h1
-; CHECK-GI-NEXT:    fcvt s1, h11
+; CHECK-GI-NEXT:    ldr h1, [sp, #240] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #240] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s2
+; CHECK-GI-NEXT:    fcvt s0, h1
+; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    bl fmodf
 ; CHECK-GI-NEXT:    ldr h1, [sp, #176] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index 0b34f9570fa77b..0ed2d5398d726b 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -671,35 +671,29 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov h13, v0.h[6]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -782,20 +776,17 @@ define <4 x half> @sin_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    mov h10, v0.h[3]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -912,40 +903,33 @@ define <8 x half> @sin_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    mov h14, v0.h[7]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1148,40 +1132,33 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    mov h1, v2.h[7]
 ; CHECK-GI-NEXT:    str h1, [sp, #160] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #192] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h15
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -1189,46 +1166,39 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
@@ -1941,35 +1911,29 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov h13, v0.h[6]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -2052,20 +2016,17 @@ define <4 x half> @cos_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    mov h10, v0.h[3]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -2182,40 +2143,33 @@ define <8 x half> @cos_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    mov h14, v0.h[7]
 ; CHECK-GI-NEXT:    fcvt s0, h0
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -2418,40 +2372,33 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    mov h1, v2.h[7]
 ; CHECK-GI-NEXT:    str h1, [sp, #160] // 2-byte Folded Spill
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h15
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #192] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h15
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h8
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h9
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h10
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h11
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h11
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h12
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h12
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h13
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
@@ -2459,46 +2406,39 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    fcvt s1, h14
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h14
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT:    str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    fmov s0, s1
+; CHECK-GI-NEXT:    fcvt s0, h1
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir b/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir
index 8e29255189bf53..5465b2ef214e49 100644
--- a/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir
+++ b/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir
@@ -16,8 +16,8 @@ body:             |
     ; CHECK-LABEL: name: 1-ldrwpre-ldrwui-merge
     ; CHECK: liveins: $w0, $w1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $w0, renamable $w2 = LDPWpre renamable $x1, 5 :: (load (s32))
-    ; CHECK-NEXT: STPWi renamable $w0, killed renamable $w2, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: early-clobber $x1, renamable $w0, renamable $w2 = LDPWpre killed renamable $x1, 5 :: (load (s32))
+    ; CHECK-NEXT: STPWi killed renamable $w0, killed renamable $w2, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $w0 = LDRWpre killed renamable $x1, 20 :: (load (s32))
     renamable $w2 = LDRWui renamable $x1, 1 :: (load (s32))
@@ -42,8 +42,8 @@ body:             |
     ; CHECK-LABEL: name: 2-ldrxpre-ldrxui-merge
     ; CHECK: liveins: $x1, $x2, $x3
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $x2, renamable $x3 = LDPXpre renamable $x1, 3 :: (load (s64))
-    ; CHECK-NEXT: STPXi renamable $x2, renamable $x3, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber $x1, renamable $x2, renamable $x3 = LDPXpre killed renamable $x1, 3 :: (load (s64))
+    ; CHECK-NEXT: STPXi killed renamable $x2, killed renamable $x3, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $x2 = LDRXpre killed renamable $x1, 24 :: (load (s64))
     renamable $x3 = LDRXui renamable $x1, 1 :: (load (s64))
@@ -68,8 +68,8 @@ body:             |
     ; CHECK-LABEL: name: 3-ldrspre-ldrsui-merge
     ; CHECK: liveins: $s0, $s1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre renamable $x1, 3 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre killed renamable $x1, 3 :: (load (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
     renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
@@ -94,8 +94,8 @@ body:             |
     ; CHECK-LABEL: name: 4-ldrqdre-ldrdui-merge
     ; CHECK: liveins: $d0, $d1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $d0, renamable $d1 = LDPDpre renamable $x1, 16 :: (load (s64))
-    ; CHECK-NEXT: STPDi renamable $d0, renamable $d1, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber $x1, renamable $d0, renamable $d1 = LDPDpre killed renamable $x1, 16 :: (load (s64))
+    ; CHECK-NEXT: STPDi killed renamable $d0, killed renamable $d1, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $d0 = LDRDpre killed renamable $x1, 128 :: (load (s64))
     renamable $d1 = LDRDui renamable $x1, 1 :: (load (s64))
@@ -124,8 +124,8 @@ body:             |
     ; CHECK-LABEL: name: 5-ldrqpre-ldrqui-merge
     ; CHECK: liveins: $q0, $q1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre renamable $x1, 3 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre killed renamable $x1, 3 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 48 :: (load (s128))
     renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
@@ -155,8 +155,8 @@ body:             |
     ; CHECK: liveins: $q0, $q1, $x1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 48, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 48, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 48 :: (load (s128))
@@ -185,8 +185,8 @@ body:             |
     ; CHECK-LABEL: name: 7-ldrqpre-ldrqui-max-offset-merge
     ; CHECK: liveins: $q0, $q1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre renamable $x1, 15 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre killed renamable $x1, 15 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 240 :: (load (s128))
     renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
@@ -215,8 +215,8 @@ body:             |
     ; CHECK-LABEL: name: 8-ldrqpre-ldrqui-min-offset-merge
     ; CHECK: liveins: $q0, $q1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre renamable $x1, -16 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre killed renamable $x1, -16 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, -256 :: (load (s128))
     renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
@@ -246,10 +246,10 @@ body:             |
     ; CHECK-LABEL: name: 9-ldrspre-ldrsui-mod-base-reg-no-merge
     ; CHECK: liveins: $s0, $s1, $x0, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: dead early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 12, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: renamable $x1 = LDRXui renamable $x0, 1 :: (load (s64))
+    ; CHECK-NEXT: dead early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: renamable $x1 = LDRXui killed renamable $x0, 1 :: (load (s64))
     ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
     renamable $x1 = LDRXui renamable $x0, 1 :: (load (s64))
@@ -280,11 +280,11 @@ body:             |
     ; CHECK-LABEL: name: 10-ldrspre-ldrsui-used-base-reg-no-merge
     ; CHECK: liveins: $s0, $s1, $x0, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 12, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12, implicit $w1 :: (load (s32))
     ; CHECK-NEXT: renamable $x0 = LDRXui renamable $x1, 1 :: (load (s64))
-    ; CHECK-NEXT: STRXui renamable $x0, renamable $x0, 1 :: (store (s64))
+    ; CHECK-NEXT: STRXui killed renamable $x0, renamable $x0, 1 :: (store (s64))
     ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
     renamable $x0 = LDRXui renamable $x1, 1 :: (load (s64))
@@ -315,13 +315,13 @@ body:             |
     ; CHECK-LABEL: name: 11-ldrqpre-ldrqpre-no-merge
     ; CHECK: liveins: $q0, $q1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q0 = LDRQpre renamable $x1, 48, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q1 = LDRQpre renamable $x1, 1, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q0 = LDRQpre renamable $x1, 16, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q1 = LDRQpre renamable $x1, 12, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 16, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q1 = LDRQpre renamable $x1, 16, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q0 = LDRQpre killed renamable $x1, 48, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q1 = LDRQpre killed renamable $x1, 1, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q0 = LDRQpre killed renamable $x1, 16, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q1 = LDRQpre killed renamable $x1, 12, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 16, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q1 = LDRQpre killed renamable $x1, 16, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 48 :: (load (s128))
     early-clobber renamable $x1, renamable $q1 = LDRQpre killed renamable $x1, 1  :: (load (s128))
@@ -352,9 +352,9 @@ body:             |
     ; CHECK-LABEL: name: 12-ldrspre-ldrsui-no-merge
     ; CHECK: liveins: $s0, $s1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 12, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12, implicit $w1 :: (load (s32))
     ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 2 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
     renamable $s1 = LDRSui renamable $x1, 2 :: (load (s32))
@@ -383,10 +383,10 @@ body:             |
     ; CHECK-LABEL: name: 13-ldrqpre-ldrdui-no-merge
     ; CHECK: liveins: $d1, $q0, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 32, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32, implicit $w1 :: (load (s128))
     ; CHECK-NEXT: renamable $d1 = LDRDui renamable $x1, 1 :: (load (s64))
-    ; CHECK-NEXT: STRQui renamable $q0, renamable $x1, 0 :: (store (s128))
-    ; CHECK-NEXT: STRDui renamable $d1, renamable $x1, 1 :: (store (s64))
+    ; CHECK-NEXT: STRQui killed renamable $q0, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: STRDui killed renamable $d1, killed renamable $x1, 1 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
     renamable $d1 = LDRDui renamable $x1, 1 :: (load (s64))
@@ -415,8 +415,8 @@ body:             |
     ; CHECK-LABEL: name: 14-ldrqpre-strqui-no-merge
     ; CHECK: liveins: $q0, $q1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 32, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: STRQui renamable $q0, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: STRQui killed renamable $q0, killed renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
     STRQui killed renamable $q0, renamable $x1, 0 :: (store (s128))
@@ -443,8 +443,8 @@ body:             |
     ; CHECK-LABEL: name: 15-ldrqpre-ldrqui-same-dst-reg-no-merge
     ; CHECK: liveins: $q0, $q1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, dead $q2, renamable $q0 = LDPQpre renamable $x1, 2 :: (load (s128))
-    ; CHECK-NEXT: STRQui renamable $q0, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, dead $q2, renamable $q0 = LDPQpre killed renamable $x1, 2 :: (load (s128))
+    ; CHECK-NEXT: STRQui killed renamable $q0, killed renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
     renamable $q0 = LDRQui renamable $x1, 1 :: (load (s128))
@@ -473,9 +473,9 @@ body:             |
     ; CHECK-LABEL: name: 16-ldrqpre-ldrqui-diff-base-reg-no-merge
     ; CHECK: liveins: $q0, $q1, $x1, $x2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 32, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: renamable $q1 = LDRQui renamable $x2, 1 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: renamable $q1 = LDRQui killed renamable $x2, 1 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
     renamable $q1 = LDRQui renamable $x2, 1 :: (load (s128))
@@ -504,8 +504,8 @@ body:             |
     ; CHECK-LABEL: name: 17-ldrqpre-ldurqi-merge
     ; CHECK: liveins: $q0, $q1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre renamable $x1, 2 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre killed renamable $x1, 2 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
     renamable $q1 = LDURQi renamable $x1, 16 :: (load (s128))
@@ -534,9 +534,9 @@ body:             |
     ; CHECK-LABEL: name: 18-ldrqpre-ldurqi-no-merge
     ; CHECK: liveins: $q0, $q1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 32, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32, implicit $w1 :: (load (s128))
     ; CHECK-NEXT: renamable $q1 = LDURQi renamable $x1, 1 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
     renamable $q1 = LDURQi renamable $x1, 1 :: (load (s128))
@@ -561,8 +561,8 @@ body:             |
     ; CHECK-LABEL: name: 19-ldrspre-ldrsui-max-merge
     ; CHECK: liveins: $s0, $s1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre renamable $x1, 63 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre killed renamable $x1, 63 :: (load (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 252 :: (load (s32))
     renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
@@ -587,9 +587,9 @@ body:             |
     ; CHECK-LABEL: name: 20-ldrspre-ldrsui-unaligned-no-merge
     ; CHECK: liveins: $s0, $s1, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 251, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 251, implicit $w1 :: (load (s32))
     ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 251 :: (load (s32))
     renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
@@ -614,8 +614,8 @@ body:             |
     ; CHECK-LABEL: name: 21-ldrswpre-ldrswui-merge
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $x0, renamable $x2 = LDPSWpre renamable $x1, 10 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber $x1, renamable $x0, renamable $x2 = LDPSWpre killed renamable $x1, 10 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
     renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
@@ -640,8 +640,8 @@ body:             |
     ; CHECK-LABEL: name: 22-ldrswpre-ldurswi-merge
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $x0, renamable $x2 = LDPSWpre renamable $x1, 10 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber $x1, renamable $x0, renamable $x2 = LDPSWpre killed renamable $x1, 10 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
     renamable $x2 = LDURSWi renamable $x1, 4 :: (load (s32))
@@ -667,8 +667,8 @@ body:             |
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre renamable $x1, 40, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
     early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
@@ -694,8 +694,8 @@ body:             |
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: renamable $x2 = LDURSWi renamable $x1, 4 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre renamable $x1, 40, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     renamable $x2 = LDURSWi renamable $x1, 4 :: (load (s32))
     early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
@@ -720,13 +720,13 @@ body:             |
     ; CHECK-LABEL: name: 25-ldrswpre-ldrswpre-no-merge
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x0 = LDRSWpre renamable $x1, 48, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x2 = LDRSWpre renamable $x1, 1, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x0 = LDRSWpre renamable $x1, 16, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x2 = LDRSWpre renamable $x1, 12, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre renamable $x1, 16, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x2 = LDRSWpre renamable $x1, 16, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x0 = LDRSWpre killed renamable $x1, 48, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x2 = LDRSWpre killed renamable $x1, 1, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x0 = LDRSWpre killed renamable $x1, 16, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x2 = LDRSWpre killed renamable $x1, 12, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 16, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x2 = LDRSWpre killed renamable $x1, 16, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 48 :: (load (s32))
     early-clobber renamable $x1, renamable $x2 = LDRSWpre killed renamable $x1, 1 :: (load (s32))
@@ -755,9 +755,9 @@ body:             |
     ; CHECK-LABEL: name: 26-ldrswpre-ldrwui-no-merge
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre renamable $x1, 40, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40, implicit $w1 :: (load (s32))
     ; CHECK-NEXT: renamable $w2 = LDRWui renamable $x1, 1, implicit-def $x2 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
     renamable $w2 = LDRWui renamable $x1, 1 :: (load (s32))
@@ -782,9 +782,9 @@ body:             |
     ; CHECK-LABEL: name: 27-ldrwpre-ldrswui-no-merge
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $w0 = LDRWpre renamable $x1, 40, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $w0 = LDRWpre killed renamable $x1, 40, implicit $w1 :: (load (s32))
     ; CHECK-NEXT: renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $w0 = LDRWpre killed renamable $x1, 40 :: (load (s32))
     renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
@@ -808,11 +808,11 @@ body:             |
     ; CHECK-LABEL: name: 28-ldrswpre-ldrwpre-no-merge
     ; CHECK: liveins: $x11, $x13
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $x10 = LDRSWpre renamable $x11, 8, implicit $w11 :: (load (s32), align 8)
+    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $x10 = LDRSWpre killed renamable $x11, 8, implicit $w11 :: (load (s32), align 8)
     ; CHECK-NEXT: $x14 = EORXrs renamable $x11, renamable $x13, 0
-    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $w12 = LDRWpre renamable $x11, 4, implicit $w11 :: (load (s32))
-    ; CHECK-NEXT: $x13 = EORXrs renamable $x11, renamable $x13, 0
-    ; CHECK-NEXT: STPXi renamable $x13, renamable $x14, renamable $x11, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $w12 = LDRWpre killed renamable $x11, 4, implicit $w11 :: (load (s32))
+    ; CHECK-NEXT: $x13 = EORXrs renamable $x11, killed renamable $x13, 0
+    ; CHECK-NEXT: STPXi killed renamable $x13, killed renamable $x14, renamable $x11, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x11, renamable $x10 = LDRSWpre killed renamable $x11, 8 :: (load (s32), align 8)
     $x14 = EORXrs renamable $x11, renamable $x13, 0
@@ -838,11 +838,11 @@ body:             |
     ; CHECK-LABEL: name: 29-ldrwpre-ldrswpre-no-merge
     ; CHECK: liveins: $x11, $x13
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $w12 = LDRWpre renamable $x11, 8, implicit $w11 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $w12 = LDRWpre killed renamable $x11, 8, implicit $w11 :: (load (s32))
     ; CHECK-NEXT: $x14 = EORXrs renamable $x11, renamable $x13, 0
-    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $x10 = LDRSWpre renamable $x11, 4, implicit $w11 :: (load (s32), align 8)
-    ; CHECK-NEXT: $x13 = EORXrs renamable $x11, renamable $x13, 0
-    ; CHECK-NEXT: STPXi renamable $x13, renamable $x14, renamable $x11, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $x10 = LDRSWpre killed renamable $x11, 4, implicit $w11 :: (load (s32), align 8)
+    ; CHECK-NEXT: $x13 = EORXrs renamable $x11, killed renamable $x13, 0
+    ; CHECK-NEXT: STPXi killed renamable $x13, killed renamable $x14, renamable $x11, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x11, renamable $w12 = LDRWpre killed renamable $x11, 8 :: (load (s32))
     $x14 = EORXrs renamable $x11, renamable $x13, 0
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 51d17ad0644f15..a09fb07d00d97e 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -104,10 +104,9 @@ define <2 x half> @exp10_v2f16(<2 x half> %x) {
 ; GISEL-NEXT:    mov h8, v0.h[1]
 ; GISEL-NEXT:    fcvt s0, h0
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    fcvt s1, h8
 ; GISEL-NEXT:    fcvt h0, s0
 ; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; GISEL-NEXT:    fmov s0, s1
+; GISEL-NEXT:    fcvt s0, h8
 ; GISEL-NEXT:    bl exp10f
 ; GISEL-NEXT:    fcvt h1, s0
 ; GISEL-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -177,15 +176,13 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; GISEL-NEXT:    mov h9, v0.h[2]
 ; GISEL-NEXT:    fcvt s0, h0
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    fcvt s1, h8
 ; GISEL-NEXT:    fcvt h0, s0
 ; GISEL-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; GISEL-NEXT:    fmov s0, s1
+; GISEL-NEXT:    fcvt s0, h8
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    fcvt s1, h9
 ; GISEL-NEXT:    fcvt h0, s0
 ; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; GISEL-NEXT:    fmov s0, s1
+; GISEL-NEXT:    fcvt s0, h9
 ; GISEL-NEXT:    bl exp10f
 ; GISEL-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
 ; GISEL-NEXT:    fcvt h0, s0
@@ -260,20 +257,17 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; GISEL-NEXT:    mov h10, v0.h[3]
 ; GISEL-NEXT:    fcvt s0, h0
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    fcvt s1, h8
 ; GISEL-NEXT:    fcvt h0, s0
 ; GISEL-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; GISEL-NEXT:    fmov s0, s1
+; GISEL-NEXT:    fcvt s0, h8
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    fcvt s1, h9
 ; GISEL-NEXT:    fcvt h0, s0
 ; GISEL-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; GISEL-NEXT:    fmov s0, s1
+; GISEL-NEXT:    fcvt s0, h9
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    fcvt s1, h10
 ; GISEL-NEXT:    fcvt h0, s0
 ; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; GISEL-NEXT:    fmov s0, s1
+; GISEL-NEXT:    fcvt s0, h10
 ; GISEL-NEXT:    bl exp10f
 ; GISEL-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; GISEL-NEXT:    fcvt h0, s0
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index c3c0ec5e3d9d8d..fad28d325b7d1e 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -216,10 +216,9 @@ define <3 x i8> @load_v3i8(ptr %ptr){
 ;
 ; CHECK-GI-LABEL: load_v3i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldrb w8, [x0]
 ; CHECK-GI-NEXT:    ldrb w1, [x0, #1]
 ; CHECK-GI-NEXT:    ldrb w2, [x0, #2]
-; CHECK-GI-NEXT:    mov w0, w8
+; CHECK-GI-NEXT:    ldrb w0, [x0]
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i8>, ptr %ptr
     ret <3 x i8> %a
diff --git a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
index f394bc467af694..a2b6f45a8fa41f 100644
--- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
+++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
@@ -11,7 +11,7 @@ define i32 @check_lr_liveness(ptr %arg) #1 {
   ; CHECK-NEXT:   successors: %bb.4(0x20000000), %bb.1(0x60000000)
   ; CHECK-NEXT:   liveins: $x0, $lr
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $x8 = COPY $x0
+  ; CHECK-NEXT:   renamable $x8 = COPY killed $x0
   ; CHECK-NEXT:   renamable $w0 = MOVi32imm -536870206
   ; CHECK-NEXT:   CBNZX killed renamable $x8, %bb.1
   ; CHECK-NEXT: {{  $}}
@@ -38,7 +38,7 @@ define i32 @check_lr_liveness(ptr %arg) #1 {
   ; CHECK-NEXT: bb.3.bb2:
   ; CHECK-NEXT:   liveins: $w0, $lr
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   RET_ReallyLR implicit $w0
+  ; CHECK-NEXT:   RET_ReallyLR implicit killed $w0
 bb:
   %icmp = icmp eq ptr %arg, null
   %or = or i1 %icmp, false
diff --git a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir
index 5b379c2bd56292..e087f0eca75859 100644
--- a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir
+++ b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir
@@ -10,13 +10,13 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $w0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w8 = ORRWrs $wzr, $w0, 0, implicit-def $x8
+  ; CHECK-NEXT:   $w8 = ORRWrs $wzr, killed $w0, 0, implicit-def $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   liveins: $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $x0 = ADDXri $x8, 1, 0
-  ; CHECK-NEXT:   RET undef $lr, implicit $x0
+  ; CHECK-NEXT:   $x0 = ADDXri killed $x8, 1, 0
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $x0
   bb.0:
     successors: %bb.1(0x80000000)
     liveins: $w0
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
index e7e109170d6a12..338084295fc7f5 100644
--- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
+++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
@@ -16,13 +16,12 @@ define i32 @test(ptr %ptr) {
 ; CHECK-NEXT:    mov w9, wzr
 ; CHECK-NEXT:  LBB0_1: ; %.thread
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    lsr w11, w9, #1
 ; CHECK-NEXT:    sub w10, w9, #1
-; CHECK-NEXT:    mov w9, w11
+; CHECK-NEXT:    lsr w9, w9, #1
 ; CHECK-NEXT:    tbnz w10, #0, LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %bb343
 ; CHECK-NEXT:    and w9, w10, #0x1
-; CHECK-NEXT:    mov w0, #-1
+; CHECK-NEXT:    mov w0, #-1 ; =0xffffffff
 ; CHECK-NEXT:    str w9, [x8]
 ; CHECK-NEXT:    ret
 bb:
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
index f2e62bc4f3c8c3..dda694dd2b3178 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
@@ -50,11 +50,10 @@ define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 {
 define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-LABEL: splice_v16f32_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v6.16b, v3.16b, v4.16b, #12
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #12
 ; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT:    ext v2.16b, v3.16b, v4.16b, #12
 ; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #12
-; CHECK-NEXT:    mov v2.16b, v6.16b
 ; CHECK-NEXT:    ret
   %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
   ret <16 x float> %res
@@ -107,11 +106,10 @@ define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
 define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-LABEL: splice_v16f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v6.16b, v3.16b, v4.16b, #12
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #12
 ; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT:    ext v2.16b, v3.16b, v4.16b, #12
 ; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #12
-; CHECK-NEXT:    mov v2.16b, v6.16b
 ; CHECK-NEXT:    ret
   %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
   ret <16 x float> %res
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index 6f4b090fb22bd6..9550c394c167d4 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -71,22 +71,19 @@ entry:
 define <32 x i16> @extadds_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) {
 ; CHECK-SD-LABEL: extadds_v32i8_i16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    saddl2 v4.8h, v1.16b, v3.16b
-; CHECK-SD-NEXT:    saddl v5.8h, v0.8b, v2.8b
 ; CHECK-SD-NEXT:    saddl2 v6.8h, v0.16b, v2.16b
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v2.8b
 ; CHECK-SD-NEXT:    saddl v2.8h, v1.8b, v3.8b
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    saddl2 v3.8h, v1.16b, v3.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v6.16b
-; CHECK-SD-NEXT:    mov v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: extadds_v32i8_i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    saddl v4.8h, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    saddl2 v5.8h, v0.16b, v2.16b
+; CHECK-GI-NEXT:    saddl v0.8h, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    saddl v2.8h, v1.8b, v3.8b
 ; CHECK-GI-NEXT:    saddl2 v3.8h, v1.16b, v3.16b
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -99,22 +96,19 @@ entry:
 define <32 x i16> @extaddu_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) {
 ; CHECK-SD-LABEL: extaddu_v32i8_i16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    uaddl2 v4.8h, v1.16b, v3.16b
-; CHECK-SD-NEXT:    uaddl v5.8h, v0.8b, v2.8b
 ; CHECK-SD-NEXT:    uaddl2 v6.8h, v0.16b, v2.16b
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v2.8b
 ; CHECK-SD-NEXT:    uaddl v2.8h, v1.8b, v3.8b
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    uaddl2 v3.8h, v1.16b, v3.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v6.16b
-; CHECK-SD-NEXT:    mov v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: extaddu_v32i8_i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    uaddl v4.8h, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    uaddl2 v5.8h, v0.16b, v2.16b
+; CHECK-GI-NEXT:    uaddl v0.8h, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    uaddl v2.8h, v1.8b, v3.8b
 ; CHECK-GI-NEXT:    uaddl2 v3.8h, v1.16b, v3.16b
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -821,22 +815,19 @@ entry:
 define <16 x i32> @extadds_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) {
 ; CHECK-SD-LABEL: extadds_v16i16_i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    saddl2 v4.4s, v1.8h, v3.8h
-; CHECK-SD-NEXT:    saddl v5.4s, v0.4h, v2.4h
 ; CHECK-SD-NEXT:    saddl2 v6.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    saddl v0.4s, v0.4h, v2.4h
 ; CHECK-SD-NEXT:    saddl v2.4s, v1.4h, v3.4h
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    saddl2 v3.4s, v1.8h, v3.8h
 ; CHECK-SD-NEXT:    mov v1.16b, v6.16b
-; CHECK-SD-NEXT:    mov v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: extadds_v16i16_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    saddl v4.4s, v0.4h, v2.4h
 ; CHECK-GI-NEXT:    saddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    saddl v0.4s, v0.4h, v2.4h
 ; CHECK-GI-NEXT:    saddl v2.4s, v1.4h, v3.4h
 ; CHECK-GI-NEXT:    saddl2 v3.4s, v1.8h, v3.8h
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -849,22 +840,19 @@ entry:
 define <16 x i32> @extaddu_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) {
 ; CHECK-SD-LABEL: extaddu_v16i16_i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    uaddl2 v4.4s, v1.8h, v3.8h
-; CHECK-SD-NEXT:    uaddl v5.4s, v0.4h, v2.4h
 ; CHECK-SD-NEXT:    uaddl2 v6.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uaddl v0.4s, v0.4h, v2.4h
 ; CHECK-SD-NEXT:    uaddl v2.4s, v1.4h, v3.4h
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    uaddl2 v3.4s, v1.8h, v3.8h
 ; CHECK-SD-NEXT:    mov v1.16b, v6.16b
-; CHECK-SD-NEXT:    mov v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: extaddu_v16i16_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    uaddl v4.4s, v0.4h, v2.4h
 ; CHECK-GI-NEXT:    uaddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    uaddl v0.4s, v0.4h, v2.4h
 ; CHECK-GI-NEXT:    uaddl v2.4s, v1.4h, v3.4h
 ; CHECK-GI-NEXT:    uaddl2 v3.4s, v1.8h, v3.8h
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1093,22 +1081,19 @@ entry:
 define <8 x i64> @extadds_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) {
 ; CHECK-SD-LABEL: extadds_v8i32_i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    saddl2 v4.2d, v1.4s, v3.4s
-; CHECK-SD-NEXT:    saddl v5.2d, v0.2s, v2.2s
 ; CHECK-SD-NEXT:    saddl2 v6.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v2.2s
 ; CHECK-SD-NEXT:    saddl v2.2d, v1.2s, v3.2s
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    saddl2 v3.2d, v1.4s, v3.4s
 ; CHECK-SD-NEXT:    mov v1.16b, v6.16b
-; CHECK-SD-NEXT:    mov v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: extadds_v8i32_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    saddl v4.2d, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    saddl2 v5.2d, v0.4s, v2.4s
+; CHECK-GI-NEXT:    saddl v0.2d, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    saddl v2.2d, v1.2s, v3.2s
 ; CHECK-GI-NEXT:    saddl2 v3.2d, v1.4s, v3.4s
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1121,22 +1106,19 @@ entry:
 define <8 x i64> @extaddu_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) {
 ; CHECK-SD-LABEL: extaddu_v8i32_i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    uaddl2 v4.2d, v1.4s, v3.4s
-; CHECK-SD-NEXT:    uaddl v5.2d, v0.2s, v2.2s
 ; CHECK-SD-NEXT:    uaddl2 v6.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v2.2s
 ; CHECK-SD-NEXT:    uaddl v2.2d, v1.2s, v3.2s
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    uaddl2 v3.2d, v1.4s, v3.4s
 ; CHECK-SD-NEXT:    mov v1.16b, v6.16b
-; CHECK-SD-NEXT:    mov v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: extaddu_v8i32_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    uaddl v4.2d, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    uaddl2 v5.2d, v0.4s, v2.4s
+; CHECK-GI-NEXT:    uaddl v0.2d, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    uaddl v2.2d, v1.2s, v3.2s
 ; CHECK-GI-NEXT:    uaddl2 v3.2d, v1.4s, v3.4s
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll
index 3dbc033dfab964..cdd04eb3fa9c37 100644
--- a/llvm/test/CodeGen/AArch64/neon-extmul.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll
@@ -296,13 +296,12 @@ define <8 x i64> @extmuladds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b)
 ; CHECK-SD-LABEL: extmuladds_v8i8_i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    smull v0.8h, v0.8b, v1.8b
-; CHECK-SD-NEXT:    sshll2 v6.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-SD-NEXT:    saddw2 v5.2d, v5.2d, v6.4s
+; CHECK-SD-NEXT:    sshll2 v6.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    saddw v0.2d, v2.2d, v1.2s
 ; CHECK-SD-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-SD-NEXT:    saddw2 v3.2d, v5.2d, v6.4s
 ; CHECK-SD-NEXT:    saddw v2.2d, v4.2d, v6.2s
-; CHECK-SD-NEXT:    mov v3.16b, v5.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: extmuladds_v8i8_i64:
@@ -334,13 +333,12 @@ define <8 x i64> @extmuladdu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b)
 ; CHECK-SD-LABEL: extmuladdu_v8i8_i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ushll2 v6.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-SD-NEXT:    uaddw2 v5.2d, v5.2d, v6.4s
+; CHECK-SD-NEXT:    ushll2 v6.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    uaddw v0.2d, v2.2d, v1.2s
 ; CHECK-SD-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-SD-NEXT:    uaddw2 v3.2d, v5.2d, v6.4s
 ; CHECK-SD-NEXT:    uaddw v2.2d, v4.2d, v6.2s
-; CHECK-SD-NEXT:    mov v3.16b, v5.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: extmuladdu_v8i8_i64:
diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll
index 15763543113eb0..c93a8c14cf8f26 100644
--- a/llvm/test/CodeGen/AArch64/neon-perm.ll
+++ b/llvm/test/CodeGen/AArch64/neon-perm.ll
@@ -4092,10 +4092,9 @@ entry:
 define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
 ; CHECK-SD-LABEL: test_uzp:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    xtn v2.8b, v0.8h
 ; CHECK-SD-NEXT:    uzp2 v1.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-SD-NEXT:    fmov d0, d2
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: test_uzp:
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 529a3b72e09714..7d7b862098879c 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -337,9 +337,9 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-GI-LABEL: sext_v3i16_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    smov x8, v0.h[0]
 ; CHECK-GI-NEXT:    smov x9, v0.h[1]
 ; CHECK-GI-NEXT:    smov x10, v0.h[2]
+; CHECK-GI-NEXT:    smov x8, v0.h[0]
 ; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
@@ -362,9 +362,9 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i32_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    smov x8, v0.s[0]
 ; CHECK-GI-NEXT:    smov x9, v0.s[1]
 ; CHECK-GI-NEXT:    smov x10, v0.s[2]
+; CHECK-GI-NEXT:    smov x8, v0.s[0]
 ; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
@@ -547,18 +547,11 @@ entry:
 }
 
 define <4 x i64> @sext_v4i32_v4i64(<4 x i32> %a) {
-; CHECK-SD-LABEL: sext_v4i32_v4i64:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll2 v1.2d, v0.4s, #0
-; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: sext_v4i32_v4i64:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll2 v1.2d, v0.4s, #0
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: sext_v4i32_v4i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sshll2 v1.2d, v0.4s, #0
+; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ret
 entry:
   %c = sext <4 x i32> %a to <4 x i64>
   ret <4 x i64> %c
@@ -671,18 +664,11 @@ entry:
 }
 
 define <8 x i32> @sext_v8i16_v8i32(<8 x i16> %a) {
-; CHECK-SD-LABEL: sext_v8i16_v8i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: sext_v8i16_v8i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: sext_v8i16_v8i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ret
 entry:
   %c = sext <8 x i16> %a to <8 x i32>
   ret <8 x i32> %c
@@ -716,21 +702,19 @@ entry:
 define <8 x i64> @sext_v8i32_v8i64(<8 x i32> %a) {
 ; CHECK-SD-LABEL: sext_v8i32_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll v5.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v4.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    sshll2 v3.2d, v1.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v8i32_v8i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v4.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    sshll2 v5.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    sshll2 v3.2d, v1.4s, #0
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -816,18 +800,11 @@ entry:
 }
 
 define <16 x i16> @sext_v16i8_v16i16(<16 x i8> %a) {
-; CHECK-SD-LABEL: sext_v16i8_v16i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: sext_v16i8_v16i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: sext_v16i8_v16i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ret
 entry:
   %c = sext <16 x i8> %a to <16 x i16>
   ret <16 x i16> %c
@@ -902,21 +879,19 @@ entry:
 define <16 x i32> @sext_v16i16_v16i32(<16 x i16> %a) {
 ; CHECK-SD-LABEL: sext_v16i16_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll v5.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    sshll2 v4.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v16i16_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v4.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
 ; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -964,30 +939,28 @@ entry:
 define <16 x i64> @sext_v16i32_v16i64(<16 x i32> %a) {
 ; CHECK-SD-LABEL: sext_v16i32_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll2 v17.2d, v0.4s, #0
-; CHECK-SD-NEXT:    sshll2 v16.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    sshll v18.2d, v1.2s, #0
+; CHECK-SD-NEXT:    sshll2 v16.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v3.4s, #0
 ; CHECK-SD-NEXT:    sshll v4.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v5.2d, v2.4s, #0
-; CHECK-SD-NEXT:    sshll2 v7.2d, v3.4s, #0
 ; CHECK-SD-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-SD-NEXT:    mov v1.16b, v17.16b
+; CHECK-SD-NEXT:    sshll2 v1.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    mov v2.16b, v18.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v16.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v16i32_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v16.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll2 v19.2d, v1.4s, #0
 ; CHECK-GI-NEXT:    sshll2 v17.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    sshll v18.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll2 v19.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll2 v5.2d, v2.4s, #0
 ; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    sshll2 v7.2d, v3.4s, #0
-; CHECK-GI-NEXT:    mov v0.16b, v16.16b
+; CHECK-GI-NEXT:    sshll2 v5.2d, v2.4s, #0
 ; CHECK-GI-NEXT:    mov v1.16b, v17.16b
 ; CHECK-GI-NEXT:    mov v2.16b, v18.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v19.16b
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index b1131f287fe9a9..56ff1c88e34f65 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -355,18 +355,11 @@ define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-SD-LABEL: shufflevector_v4i64:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    zip2 v2.2d, v2.2d, v3.2d
-; CHECK-SD-NEXT:    zip2 v0.2d, v0.2d, v1.2d
-; CHECK-SD-NEXT:    mov v1.16b, v2.16b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: shufflevector_v4i64:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    zip2 v0.2d, v0.2d, v1.2d
-; CHECK-GI-NEXT:    zip2 v1.2d, v2.2d, v3.2d
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: shufflevector_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v1.2d, v2.2d, v3.2d
+; CHECK-NEXT:    ret
     %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
     ret <4 x i64> %c
 }
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
index 11cf388e385312..8df159da3c3bfc 100644
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy %s -o - | FileCheck %s
 # RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND
 --- |
@@ -34,14 +35,7 @@ body:             |
   bb.0.entry:
     liveins: $p0
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_ppr
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
-    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register: ''
 
-    ; EXPAND-LABEL: name: spills_fills_stack_id_ppr
-    ; EXPAND: STR_PXI $p0, $sp, 7
-    ; EXPAND: $p0 = LDR_PXI $sp, 7
 
     %0:ppr = COPY $p0
 
@@ -77,16 +71,7 @@ body:             |
   bb.0.entry:
     liveins: $p0_p1
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_ppr2
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
-    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register: ''
 
-    ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2
-    ; EXPAND: STR_PXI $p0, $sp, 6
-    ; EXPAND: STR_PXI $p1, $sp, 7
-    ; EXPAND: $p0 = LDR_PXI $sp, 6
-    ; EXPAND: $p1 = LDR_PXI $sp, 7
 
     %0:ppr2 = COPY $p0_p1
 
@@ -122,16 +107,7 @@ body:             |
   bb.0.entry:
     liveins: $p0_p1
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_ppr2
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
-    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register: ''
 
-    ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2mul2
-    ; EXPAND: STR_PXI $p0, $sp, 6
-    ; EXPAND: STR_PXI $p1, $sp, 7
-    ; EXPAND: $p0 = LDR_PXI $sp, 6
-    ; EXPAND: $p1 = LDR_PXI $sp, 7
 
     %0:ppr2mul2 = COPY $p0_p1
 
@@ -167,14 +143,7 @@ body:             |
   bb.0.entry:
     liveins: $pn0
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_pnr
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
-    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register: ''
 
-    ; EXPAND-LABEL: name: spills_fills_stack_id_pnr
-    ; EXPAND: STR_PXI $pn0, $sp, 7
-    ; EXPAND: $pn0 = LDR_PXI $sp, 7, implicit-def $pn0
 
     %0:pnr = COPY $pn0
 
@@ -206,17 +175,7 @@ registers:
 stack:
 body:             |
   bb.0.entry:
-    ; CHECK-LABEL: name: spills_fills_stack_id_virtreg_pnr
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
-    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register: ''
 
-    ; EXPAND-LABEL: name: spills_fills_stack_id_virtreg_pnr
-    ; EXPAND: renamable $pn8 = WHILEGE_CXX_B
-    ; EXPAND: STR_PXI killed renamable $pn8, $sp, 7
-    ;
-    ; EXPAND: renamable $pn8 = LDR_PXI $sp, 7
-    ; EXPAND: $p0 = PEXT_PCI_B killed renamable $pn8, 0
 
 
     %0:pnr_p8to15 = WHILEGE_CXX_B undef $x0, undef $x0, 0, implicit-def dead $nzcv
@@ -253,14 +212,7 @@ body:             |
   bb.0.entry:
     liveins: $z0
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_zpr
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16
-    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register: ''
 
-    ; EXPAND-LABEL: name: spills_fills_stack_id_zpr
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
 
     %0:zpr = COPY $z0
 
@@ -288,16 +240,7 @@ body:             |
   bb.0.entry:
     liveins: $z0_z1
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_zpr2
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 16
-    ; CHECK-NEXT:     stack-id: scalable-vector
 
-    ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: STR_ZXI $z1, $sp, 1
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
-    ; EXPAND: $z1 = LDR_ZXI $sp, 1
 
     %0:zpr2 = COPY $z0_z1
 
@@ -333,16 +276,7 @@ body:             |
   bb.1:
     liveins: $z0_z8
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_zpr2strided
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 16
-    ; CHECK-NEXT:     stack-id: scalable-vector
 
-    ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2strided
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: STR_ZXI $z8, $sp, 1
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
-    ; EXPAND: $z8 = LDR_ZXI $sp, 1
 
     %0:zpr2strided = COPY $z0_z8
 
@@ -370,18 +304,7 @@ body:             |
   bb.0.entry:
     liveins: $z0_z1_z2
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_zpr3
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 48, alignment: 16
-    ; CHECK-NEXT:     stack-id: scalable-vector
 
-    ; EXPAND-LABEL: name: spills_fills_stack_id_zpr3
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: STR_ZXI $z1, $sp, 1
-    ; EXPAND: STR_ZXI $z2, $sp, 2
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
-    ; EXPAND: $z1 = LDR_ZXI $sp, 1
-    ; EXPAND: $z2 = LDR_ZXI $sp, 2
 
     %0:zpr3 = COPY $z0_z1_z2
 
@@ -409,20 +332,7 @@ body:             |
   bb.0.entry:
     liveins: $z0_z1_z2_z3
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_zpr4
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 64, alignment: 16
-    ; CHECK-NEXT:     stack-id: scalable-vector
-
-    ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: STR_ZXI $z1, $sp, 1
-    ; EXPAND: STR_ZXI $z2, $sp, 2
-    ; EXPAND: STR_ZXI $z3, $sp, 3
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
-    ; EXPAND: $z1 = LDR_ZXI $sp, 1
-    ; EXPAND: $z2 = LDR_ZXI $sp, 2
-    ; EXPAND: $z3 = LDR_ZXI $sp, 3
+
 
     %0:zpr4 = COPY $z0_z1_z2_z3
 
@@ -457,20 +367,7 @@ body:             |
   bb.1:
     liveins: $z0_z4_z8_z12
 
-    ; CHECK-LABEL: name: spills_fills_stack_id_zpr4strided
-    ; CHECK: stack:
-    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 64, alignment: 16
-    ; CHECK-NEXT:     stack-id: scalable-vector
-
-    ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4strided
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: STR_ZXI $z4, $sp, 1
-    ; EXPAND: STR_ZXI $z8, $sp, 2
-    ; EXPAND: STR_ZXI $z12, $sp, 3
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
-    ; EXPAND: $z4 = LDR_ZXI $sp, 1
-    ; EXPAND: $z8 = LDR_ZXI $sp, 2
-    ; EXPAND: $z12 = LDR_ZXI $sp, 3
+
 
     %0:zpr4strided = COPY $z0_z4_z8_z12
 
@@ -486,3 +383,6 @@ body:             |
     $z0_z4_z8_z12 = COPY %0
     RET_ReallyLR
 ...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# CHECK: {{.*}}
+# EXPAND: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index 20faeb23eed59d..0f8e8a8f06ce24 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -180,16 +180,15 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    bl __arm_sme_state
-; CHECK-NO-SME-ROUTINES-NEXT:    adrp x8, :got:dst
 ; CHECK-NO-SME-ROUTINES-NEXT:    and x19, x0, #0x1
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
-; CHECK-NO-SME-ROUTINES-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
 ; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_2
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.1: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
 ; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_2: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT:    mov x0, x8
 ; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
 ; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_4
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.3: // %entry
diff --git a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll
index 88e13ea1e0fa4f..142fa413c7e55b 100644
--- a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll
@@ -192,9 +192,8 @@ define <vscale x 2 x i64> @zext_i32_i64(<vscale x 2 x i32> %a) {
 define <vscale x 16 x i16> @sext_b_to_h(<vscale x 16 x i8> %a) {
 ; CHECK-LABEL: sext_b_to_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sunpklo z2.h, z0.b
 ; CHECK-NEXT:    sunpkhi z1.h, z0.b
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    ret
   %ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i16>
   ret <vscale x 16 x i16> %ext
@@ -203,9 +202,8 @@ define <vscale x 16 x i16> @sext_b_to_h(<vscale x 16 x i8> %a) {
 define <vscale x 8 x i32> @sext_h_to_s(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: sext_h_to_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sunpklo z2.s, z0.h
 ; CHECK-NEXT:    sunpkhi z1.s, z0.h
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
   %ext = sext <vscale x 8 x i16> %a to <vscale x 8 x i32>
   ret <vscale x 8 x i32> %ext
@@ -214,9 +212,8 @@ define <vscale x 8 x i32> @sext_h_to_s(<vscale x 8 x i16> %a) {
 define <vscale x 4 x i64> @sext_s_to_d(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: sext_s_to_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sunpklo z2.d, z0.s
 ; CHECK-NEXT:    sunpkhi z1.d, z0.s
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
   %ext = sext <vscale x 4 x i32> %a to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %ext
@@ -261,9 +258,8 @@ define <vscale x 16 x i64> @sext_b_to_d(<vscale x 16 x i8> %a) {
 define <vscale x 16 x i16> @zext_b_to_h(<vscale x 16 x i8> %a) {
 ; CHECK-LABEL: zext_b_to_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z2.h, z0.b
 ; CHECK-NEXT:    uunpkhi z1.h, z0.b
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    ret
   %ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
   ret <vscale x 16 x i16> %ext
@@ -272,9 +268,8 @@ define <vscale x 16 x i16> @zext_b_to_h(<vscale x 16 x i8> %a) {
 define <vscale x 8 x i32> @zext_h_to_s(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: zext_h_to_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    uunpkhi z1.s, z0.h
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
   %ext = zext <vscale x 8 x i16> %a to <vscale x 8 x i32>
   ret <vscale x 8 x i32> %ext
@@ -283,9 +278,8 @@ define <vscale x 8 x i32> @zext_h_to_s(<vscale x 8 x i16> %a) {
 define <vscale x 4 x i64> @zext_s_to_d(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: zext_s_to_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z1.d, z0.s
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
   %ext = zext <vscale x 4 x i32> %a to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %ext
@@ -333,9 +327,8 @@ define <vscale x 4 x i64> @zext_4i8_4i64(<vscale x 4 x i8> %aval) {
 ; CHECK-LABEL: zext_4i8_4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and z0.s, z0.s, #0xff
-; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z1.d, z0.s
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
   %aext = zext <vscale x 4 x i8> %aval to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %aext
@@ -345,9 +338,8 @@ define <vscale x 4 x i64> @zext_4i16_4i64(<vscale x 4 x i16> %aval) {
 ; CHECK-LABEL: zext_4i16_4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and z0.s, z0.s, #0xffff
-; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z1.d, z0.s
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
   %aext = zext <vscale x 4 x i16> %aval to <vscale x 4 x i64>
   ret <vscale x 4 x i64> %aext
@@ -357,9 +349,8 @@ define <vscale x 8 x i32> @zext_8i8_8i32(<vscale x 8 x i8> %aval) {
 ; CHECK-LABEL: zext_8i8_8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and z0.h, z0.h, #0xff
-; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    uunpkhi z1.s, z0.h
-; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
   %aext = zext <vscale x 8 x i8> %aval to <vscale x 8 x i32>
   ret <vscale x 8 x i32> %aext
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index 77aaeeadcfc2f0..da293ab51e73aa 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -3132,8 +3132,8 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind {
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
 ; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
-; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index fd1365d56fee47..eafb71a0b23a3e 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -171,10 +171,9 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv
 ; CHECK-LABEL: vector_deinterleave_nxv4i64_nxv8i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z5.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp2 z6.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp2 z3.d, z2.d, z3.d
-; CHECK-NEXT:    mov z0.d, z5.d
 ; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    mov z2.d, z6.d
 ; CHECK-NEXT:    ret
@@ -186,20 +185,16 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>}  @vector_deinterleave_nxv8i64_nx
 ; CHECK-LABEL: vector_deinterleave_nxv8i64_nxv16i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z24.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z26.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z27.d, z6.d, z7.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z2.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp2 z28.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp2 z29.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z30.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z3.d, z6.d, z7.d
 ; CHECK-NEXT:    uzp2 z7.d, z6.d, z7.d
-; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    uzp2 z6.d, z4.d, z5.d
 ; CHECK-NEXT:    mov z1.d, z24.d
-; CHECK-NEXT:    mov z2.d, z26.d
-; CHECK-NEXT:    mov z3.d, z27.d
 ; CHECK-NEXT:    mov z4.d, z28.d
 ; CHECK-NEXT:    mov z5.d, z29.d
-; CHECK-NEXT:    mov z6.d, z30.d
 ; CHECK-NEXT:    ret
 %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
 ret {<vscale x 8 x i64>, <vscale x 8 x i64>}  %retval
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index e2c3b0abe21aab..fe089fa4a6417e 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -166,10 +166,9 @@ define <vscale x 16 x i32> @interleave2_nxv16i32(<vscale x 8 x i32> %vec0, <vsca
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    zip1 z4.s, z1.s, z3.s
 ; CHECK-NEXT:    zip1 z5.s, z0.s, z2.s
-; CHECK-NEXT:    zip2 z2.s, z0.s, z2.s
 ; CHECK-NEXT:    zip2 z3.s, z1.s, z3.s
+; CHECK-NEXT:    zip2 z1.s, z0.s, z2.s
 ; CHECK-NEXT:    mov z0.d, z5.d
-; CHECK-NEXT:    mov z1.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z4.d
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 16 x i32>@llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
@@ -181,10 +180,9 @@ define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
 ; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z2.d, z0.d, z2.d
 ; CHECK-NEXT:    zip2 z3.d, z1.d, z3.d
+; CHECK-NEXT:    zip2 z1.d, z0.d, z2.d
 ; CHECK-NEXT:    mov z0.d, z5.d
-; CHECK-NEXT:    mov z1.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z4.d
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
index 3a481efd9785aa..9ef6047b0eb8c4 100644
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -60,8 +60,7 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-NEXT:    uzp2 v2.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    st1 { v1.s }[2], [x8]
 ; CHECK-NEXT:    str d1, [x0]
-; CHECK-NEXT:    cmtst v2.4s, v2.4s, v2.4s
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    cmtst v0.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    ret
   %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
   %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -79,8 +78,7 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    uzp2 v2.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    str q1, [x0]
-; CHECK-NEXT:    cmtst v2.4s, v2.4s, v2.4s
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    cmtst v0.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    ret
   %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
   %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
@@ -143,15 +141,13 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-NEXT:    umull v5.2d, v0.2s, v2.2s
 ; CHECK-NEXT:    umull2 v6.2d, v1.4s, v3.4s
 ; CHECK-NEXT:    umull v7.2d, v1.2s, v3.2s
-; CHECK-NEXT:    mul v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    mul v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    uzp2 v4.4s, v5.4s, v4.4s
 ; CHECK-NEXT:    uzp2 v5.4s, v7.4s, v6.4s
 ; CHECK-NEXT:    stp q2, q1, [x0]
-; CHECK-NEXT:    cmtst v4.4s, v4.4s, v4.4s
-; CHECK-NEXT:    cmtst v5.4s, v5.4s, v5.4s
-; CHECK-NEXT:    mov v0.16b, v4.16b
-; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    cmtst v0.4s, v4.4s, v4.4s
+; CHECK-NEXT:    cmtst v1.4s, v5.4s, v5.4s
 ; CHECK-NEXT:    ret
   %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
   %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 54ada05c904487..150a67ab7974dd 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4819,45 +4819,47 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-GI-NEXT:    usubl v1.8h, v1.8b, v2.8b
 ; CHECK-GI-NEXT:    ldr d3, [x10]
 ; CHECK-GI-NEXT:    ldr d4, [x11]
-; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
 ; CHECK-GI-NEXT:    ldr d2, [x10]
+; CHECK-GI-NEXT:    ldr d6, [x11]
 ; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
 ; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ldr d6, [x11]
-; CHECK-GI-NEXT:    add x11, x11, x8
 ; CHECK-GI-NEXT:    usubl v3.8h, v3.8b, v4.8b
-; CHECK-GI-NEXT:    abs v5.4s, v5.4s
-; CHECK-GI-NEXT:    abs v0.4s, v0.4s
 ; CHECK-GI-NEXT:    ldr d4, [x10]
 ; CHECK-GI-NEXT:    ldr d16, [x11]
+; CHECK-GI-NEXT:    abs v5.4s, v5.4s
+; CHECK-GI-NEXT:    abs v0.4s, v0.4s
 ; CHECK-GI-NEXT:    abs v7.4s, v7.4s
-; CHECK-GI-NEXT:    abs v1.4s, v1.4s
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    abs v1.4s, v1.4s
+; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b
 ; CHECK-GI-NEXT:    usubl v2.8h, v2.8b, v6.8b
 ; CHECK-GI-NEXT:    ldr d6, [x10]
 ; CHECK-GI-NEXT:    ldr d17, [x11]
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b
-; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
 ; CHECK-GI-NEXT:    ldr d5, [x10]
+; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
 ; CHECK-GI-NEXT:    ldr d7, [x11]
+; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    sshll v18.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
 ; CHECK-GI-NEXT:    usubl v6.8h, v6.8b, v17.8b
-; CHECK-GI-NEXT:    ldr d17, [x11, x8]
-; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0
 ; CHECK-GI-NEXT:    usubl v5.8h, v5.8b, v7.8b
 ; CHECK-GI-NEXT:    ldr d7, [x10, x9]
-; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    ldr d17, [x11, x8]
+; CHECK-GI-NEXT:    abs v19.4s, v19.4s
+; CHECK-GI-NEXT:    abs v4.4s, v4.4s
 ; CHECK-GI-NEXT:    abs v16.4s, v16.4s
 ; CHECK-GI-NEXT:    abs v3.4s, v3.4s
 ; CHECK-GI-NEXT:    abs v18.4s, v18.4s
@@ -4865,36 +4867,33 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-GI-NEXT:    usubl v7.8h, v7.8b, v17.8b
 ; CHECK-GI-NEXT:    sshll v17.4s, v6.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v6.4s, v6.8h, #0
-; CHECK-GI-NEXT:    abs v19.4s, v19.4s
-; CHECK-GI-NEXT:    abs v4.4s, v4.4s
+; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    add v3.4s, v16.4s, v3.4s
 ; CHECK-GI-NEXT:    sshll v16.4s, v5.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0
 ; CHECK-GI-NEXT:    add v2.4s, v18.4s, v2.4s
 ; CHECK-GI-NEXT:    abs v17.4s, v17.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
 ; CHECK-GI-NEXT:    abs v6.4s, v6.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s
-; CHECK-GI-NEXT:    addv s3, v3.4s
+; CHECK-GI-NEXT:    addv s4, v4.4s
 ; CHECK-GI-NEXT:    sshll v18.4s, v7.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v7.4s, v7.8h, #0
 ; CHECK-GI-NEXT:    abs v16.4s, v16.4s
 ; CHECK-GI-NEXT:    abs v5.4s, v5.4s
 ; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    add v6.4s, v17.4s, v6.4s
-; CHECK-GI-NEXT:    addv s2, v2.4s
 ; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    addv s4, v4.4s
-; CHECK-GI-NEXT:    fmov w10, s3
+; CHECK-GI-NEXT:    addv s2, v2.4s
+; CHECK-GI-NEXT:    addv s3, v3.4s
+; CHECK-GI-NEXT:    fmov w10, s4
 ; CHECK-GI-NEXT:    abs v18.4s, v18.4s
 ; CHECK-GI-NEXT:    abs v7.4s, v7.4s
 ; CHECK-GI-NEXT:    add v1.4s, v16.4s, v5.4s
 ; CHECK-GI-NEXT:    add w8, w8, w9
-; CHECK-GI-NEXT:    addv s3, v6.4s
 ; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    addv s3, v6.4s
 ; CHECK-GI-NEXT:    add w8, w10, w8
-; CHECK-GI-NEXT:    fmov w10, s4
 ; CHECK-GI-NEXT:    add v0.4s, v18.4s, v7.4s
 ; CHECK-GI-NEXT:    addv s1, v1.4s
 ; CHECK-GI-NEXT:    add w8, w9, w8
diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index 0b90343a40c839..5414bacf42a92e 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -8,15 +8,14 @@ define <16 x i32> @no_existing_zext(<16 x i8> %a, <16 x i32> %op) {
 ; CHECK-NEXT:    cmhi.16b v0, v0, v5
 ; CHECK-NEXT:    sshll.8h v5, v0, #0
 ; CHECK-NEXT:    sshll2.8h v0, v0, #0
-; CHECK-NEXT:    sshll2.4s v16, v0, #0
 ; CHECK-NEXT:    sshll.4s v6, v5, #0
 ; CHECK-NEXT:    sshll.4s v7, v0, #0
 ; CHECK-NEXT:    sshll2.4s v5, v5, #0
-; CHECK-NEXT:    and.16b v4, v4, v16
+; CHECK-NEXT:    sshll2.4s v16, v0, #0
 ; CHECK-NEXT:    and.16b v0, v1, v6
 ; CHECK-NEXT:    and.16b v1, v2, v5
 ; CHECK-NEXT:    and.16b v2, v3, v7
-; CHECK-NEXT:    mov.16b v3, v4
+; CHECK-NEXT:    and.16b v3, v4, v16
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp ugt <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -333,8 +332,8 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_other_use(<16
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_other_use:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.16b v16, #10
-; CHECK-NEXT:    ushll.8h v19, v0, #0
 ; CHECK-NEXT:    ldr q21, [sp]
+; CHECK-NEXT:    ushll.8h v19, v0, #0
 ; CHECK-NEXT:    ushll.4s v24, v19, #0
 ; CHECK-NEXT:    ushll2.4s v19, v19, #0
 ; CHECK-NEXT:    cmhi.16b v16, v0, v16
@@ -352,8 +351,8 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_other_use(<16
 ; CHECK-NEXT:    sshll2.2d v26, v17, #0
 ; CHECK-NEXT:    sshll.2d v27, v17, #0
 ; CHECK-NEXT:    and.16b v20, v21, v20
-; CHECK-NEXT:    sshll2.2d v21, v22, #0
 ; CHECK-NEXT:    and.16b v7, v7, v23
+; CHECK-NEXT:    sshll2.2d v21, v22, #0
 ; CHECK-NEXT:    sshll.2d v23, v22, #0
 ; CHECK-NEXT:    and.16b v6, v6, v26
 ; CHECK-NEXT:    sshll2.2d v26, v16, #0
@@ -361,16 +360,14 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_other_use(<16
 ; CHECK-NEXT:    stp q7, q20, [x0, #96]
 ; CHECK-NEXT:    sshll.2d v20, v16, #0
 ; CHECK-NEXT:    and.16b v21, v4, v21
-; CHECK-NEXT:    and.16b v4, v0, v18
 ; CHECK-NEXT:    and.16b v7, v3, v23
-; CHECK-NEXT:    and.16b v3, v19, v22
-; CHECK-NEXT:    stp q5, q6, [x0, #64]
+; CHECK-NEXT:    and.16b v3, v0, v18
 ; CHECK-NEXT:    and.16b v0, v24, v16
+; CHECK-NEXT:    stp q5, q6, [x0, #64]
 ; CHECK-NEXT:    and.16b v6, v2, v26
 ; CHECK-NEXT:    and.16b v2, v25, v17
 ; CHECK-NEXT:    and.16b v5, v1, v20
-; CHECK-NEXT:    mov.16b v1, v3
-; CHECK-NEXT:    mov.16b v3, v4
+; CHECK-NEXT:    and.16b v1, v19, v22
 ; CHECK-NEXT:    stp q7, q21, [x0, #32]
 ; CHECK-NEXT:    stp q5, q6, [x0]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 66bb131ce72494..38ad96df79cf6d 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2848,21 +2848,21 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1]
 ; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
-; CHECK-BE-NEXT:    add x8, x0, #16
-; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
 ; CHECK-BE-NEXT:    add x9, x0, #48
 ; CHECK-BE-NEXT:    add x10, x0, #32
 ; CHECK-BE-NEXT:    subs w2, w2, #1
 ; CHECK-BE-NEXT:    add x1, x1, #16
 ; CHECK-BE-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-BE-NEXT:    umull2 v5.4s, v3.8h, v0.8h
 ; CHECK-BE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
+; CHECK-BE-NEXT:    add x8, x0, #16
 ; CHECK-BE-NEXT:    umull v4.4s, v1.4h, v2.4h
-; CHECK-BE-NEXT:    umull2 v5.4s, v3.8h, v0.8h
-; CHECK-BE-NEXT:    umull v0.4s, v3.4h, v0.4h
 ; CHECK-BE-NEXT:    umull2 v1.4s, v1.8h, v2.8h
+; CHECK-BE-NEXT:    umull v0.4s, v3.4h, v0.4h
+; CHECK-BE-NEXT:    st1 { v5.4s }, [x9]
 ; CHECK-BE-NEXT:    st1 { v4.4s }, [x0]
 ; CHECK-BE-NEXT:    mov x0, x8
-; CHECK-BE-NEXT:    st1 { v5.4s }, [x9]
 ; CHECK-BE-NEXT:    st1 { v0.4s }, [x10]
 ; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
 ; CHECK-BE-NEXT:    b.ne .LBB24_1
@@ -2980,16 +2980,16 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    add x10, x0, #16
 ; CHECK-BE-NEXT:    subs w2, w2, #1
 ; CHECK-BE-NEXT:    ext v17.16b, v5.16b, v5.16b, #8
-; CHECK-BE-NEXT:    ext v19.16b, v6.16b, v6.16b, #8
 ; CHECK-BE-NEXT:    rev32 v5.8b, v5.8b
+; CHECK-BE-NEXT:    ext v19.16b, v6.16b, v6.16b, #8
 ; CHECK-BE-NEXT:    rev32 v21.8b, v7.8b
 ; CHECK-BE-NEXT:    rev32 v23.8b, v4.8b
 ; CHECK-BE-NEXT:    ext v7.16b, v7.16b, v7.16b, #8
 ; CHECK-BE-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
 ; CHECK-BE-NEXT:    rev32 v6.8b, v6.8b
 ; CHECK-BE-NEXT:    rev32 v17.8b, v17.8b
-; CHECK-BE-NEXT:    rev32 v19.8b, v19.8b
 ; CHECK-BE-NEXT:    umull v5.2d, v5.2s, v18.2s
+; CHECK-BE-NEXT:    rev32 v19.8b, v19.8b
 ; CHECK-BE-NEXT:    umull v18.2d, v21.2s, v22.2s
 ; CHECK-BE-NEXT:    ext v21.16b, v22.16b, v22.16b, #8
 ; CHECK-BE-NEXT:    rev32 v7.8b, v7.8b
@@ -2997,9 +2997,9 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    ext v16.16b, v16.16b, v16.16b, #8
 ; CHECK-BE-NEXT:    rev32 v4.8b, v4.8b
 ; CHECK-BE-NEXT:    umull v17.2d, v17.2s, v24.2s
-; CHECK-BE-NEXT:    umull v19.2d, v19.2s, v25.2s
 ; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
 ; CHECK-BE-NEXT:    umull v5.2d, v6.2s, v20.2s
+; CHECK-BE-NEXT:    umull v19.2d, v19.2s, v25.2s
 ; CHECK-BE-NEXT:    umull v6.2d, v7.2s, v21.2s
 ; CHECK-BE-NEXT:    add x8, x0, #112
 ; CHECK-BE-NEXT:    umull v4.2d, v4.2s, v16.2s
@@ -3007,11 +3007,11 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    add x9, x0, #80
 ; CHECK-BE-NEXT:    st1 { v22.2d }, [x0]
 ; CHECK-BE-NEXT:    st1 { v17.2d }, [x8]
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
 ; CHECK-BE-NEXT:    add x8, x0, #64
 ; CHECK-BE-NEXT:    st1 { v19.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x0, #48
 ; CHECK-BE-NEXT:    mov x0, x8
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
 ; CHECK-BE-NEXT:    st1 { v6.2d }, [x9]
 ; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
 ; CHECK-BE-NEXT:    b.ne .LBB25_1
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index bb968c8eb00fcb..5cd7fd15735f3d 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -354,9 +354,9 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-GI-LABEL: zext_v3i16_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    umov w8, v0.h[0]
 ; CHECK-GI-NEXT:    umov w9, v0.h[1]
 ; CHECK-GI-NEXT:    umov w10, v0.h[2]
+; CHECK-GI-NEXT:    umov w8, v0.h[0]
 ; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
@@ -379,9 +379,9 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i32_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, v0.s[0]
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
+; CHECK-GI-NEXT:    mov w8, v0.s[0]
 ; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
@@ -564,18 +564,11 @@ entry:
 }
 
 define <4 x i64> @zext_v4i32_v4i64(<4 x i32> %a) {
-; CHECK-SD-LABEL: zext_v4i32_v4i64:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: zext_v4i32_v4i64:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: zext_v4i32_v4i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ret
 entry:
   %c = zext <4 x i32> %a to <4 x i64>
   ret <4 x i64> %c
@@ -696,18 +689,11 @@ entry:
 }
 
 define <8 x i32> @zext_v8i16_v8i32(<8 x i16> %a) {
-; CHECK-SD-LABEL: zext_v8i16_v8i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: zext_v8i16_v8i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: zext_v8i16_v8i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ret
 entry:
   %c = zext <8 x i16> %a to <8 x i32>
   ret <8 x i32> %c
@@ -741,21 +727,19 @@ entry:
 define <8 x i64> @zext_v8i32_v8i64(<8 x i32> %a) {
 ; CHECK-SD-LABEL: zext_v8i32_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll v5.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v4.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    ushll2 v3.2d, v1.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v8i32_v8i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v5.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -833,18 +817,11 @@ entry:
 }
 
 define <16 x i16> @zext_v16i8_v16i16(<16 x i8> %a) {
-; CHECK-SD-LABEL: zext_v16i8_v16i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: zext_v16i8_v16i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: zext_v16i8_v16i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ret
 entry:
   %c = zext <16 x i8> %a to <16 x i16>
   ret <16 x i16> %c
@@ -919,21 +896,19 @@ entry:
 define <16 x i32> @zext_v16i16_v16i32(<16 x i16> %a) {
 ; CHECK-SD-LABEL: zext_v16i16_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll v5.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll2 v4.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-SD-NEXT:    mov v0.16b, v5.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v16i16_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
 ; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -981,30 +956,28 @@ entry:
 define <16 x i64> @zext_v16i32_v16i64(<16 x i32> %a) {
 ; CHECK-SD-LABEL: zext_v16i32_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll2 v17.2d, v0.4s, #0
-; CHECK-SD-NEXT:    ushll2 v16.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    ushll v18.2d, v1.2s, #0
+; CHECK-SD-NEXT:    ushll2 v16.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll2 v7.2d, v3.4s, #0
 ; CHECK-SD-NEXT:    ushll v4.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v5.2d, v2.4s, #0
-; CHECK-SD-NEXT:    ushll2 v7.2d, v3.4s, #0
 ; CHECK-SD-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-SD-NEXT:    mov v1.16b, v17.16b
+; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    mov v2.16b, v18.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v16.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v16i32_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v16.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v19.2d, v1.4s, #0
 ; CHECK-GI-NEXT:    ushll2 v17.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    ushll v18.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll2 v19.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll2 v5.2d, v2.4s, #0
 ; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v7.2d, v3.4s, #0
-; CHECK-GI-NEXT:    mov v0.16b, v16.16b
+; CHECK-GI-NEXT:    ushll2 v5.2d, v2.4s, #0
 ; CHECK-GI-NEXT:    mov v1.16b, v17.16b
 ; CHECK-GI-NEXT:    mov v2.16b, v18.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v19.16b
diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
index 5ebb115791c663..a14e943584f5f6 100644
--- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll
+++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
@@ -97,8 +97,7 @@ define <2 x i1> @saddo(ptr %ptr, ptr %ptr2) {
 ; CHECK-NEXT:    vand q9, q9, q10
 ; CHECK-NEXT:    vmvn q9, q9
 ; CHECK-NEXT:    vmovn.i64 d18, q9
-; CHECK-NEXT:    vmov r2, r1, d18
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d18
 ; CHECK-NEXT:    bx lr
   %x = load <2 x i64>, ptr %ptr, align 8
   %y = load <2 x i64>, ptr %ptr2, align 8
@@ -122,8 +121,7 @@ define <2 x i1> @ssubo(ptr %ptr, ptr %ptr2) {
 ; CHECK-NEXT:    vand q9, q9, q10
 ; CHECK-NEXT:    vmvn q9, q9
 ; CHECK-NEXT:    vmovn.i64 d18, q9
-; CHECK-NEXT:    vmov r2, r1, d18
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d18
 ; CHECK-NEXT:    bx lr
   %x = load <2 x i64>, ptr %ptr, align 8
   %y = load <2 x i64>, ptr %ptr2, align 8
diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
index 78090083a00264..b8ea7c10ad2f4c 100644
--- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
@@ -1268,7 +1268,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s1
 ; CHECK-NEON-NEXT:    vmov.f32 s16, s7
 ; CHECK-NEON-NEXT:    vmov.f32 s18, s6
-; CHECK-NEON-NEXT:    vmov.f32 s20, s5
 ; CHECK-NEON-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEON-NEXT:    vmov.f32 s24, s3
 ; CHECK-NEON-NEXT:    vmov.f32 s26, s2
@@ -1301,12 +1300,12 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s28
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
-; CHECK-NEON-NEXT:    vmov r1, s20
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
 ; CHECK-NEON-NEXT:    vmov s2, r5
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    vmov r0, s0
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s30
+; CHECK-NEON-NEXT:    vmov r1, s20
 ; CHECK-NEON-NEXT:    vmov.32 d8[0], r0
 ; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vmov.32 d12[0], r0
@@ -1514,7 +1513,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s1
 ; CHECK-NEON-NEXT:    vmov.f32 s16, s7
 ; CHECK-NEON-NEXT:    vmov.f32 s18, s6
-; CHECK-NEON-NEXT:    vmov.f32 s20, s5
 ; CHECK-NEON-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEON-NEXT:    vmov.f32 s24, s3
 ; CHECK-NEON-NEXT:    vmov.f32 s26, s2
@@ -1547,12 +1545,12 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s28
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
-; CHECK-NEON-NEXT:    vmov r1, s20
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
 ; CHECK-NEON-NEXT:    vmov s2, r5
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    vmov r0, s0
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s30
+; CHECK-NEON-NEXT:    vmov r1, s20
 ; CHECK-NEON-NEXT:    vmov.32 d8[0], r0
 ; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vmov.32 d12[0], r0
@@ -2459,24 +2457,23 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) {
 ; CHECK-NEXT:    vorr q4, q0, q0
 ; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __aeabi_d2lz
-; CHECK-NEXT:    vmov r2, r12, d9
 ; CHECK-NEXT:    mvn r5, #0
 ; CHECK-NEXT:    subs r3, r0, r5
-; CHECK-NEXT:    mov r6, #0
 ; CHECK-NEXT:    sbcs r3, r1, #0
-; CHECK-NEXT:    mov r4, #0
+; CHECK-NEXT:    mov r6, #0
 ; CHECK-NEXT:    mov r3, #0
+; CHECK-NEXT:    mov r4, #0
 ; CHECK-NEXT:    movwlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    movne r3, r1
 ; CHECK-NEXT:    moveq r0, r5
 ; CHECK-NEXT:    rsbs r1, r0, #0
 ; CHECK-NEXT:    rscs r1, r3, #0
+; CHECK-NEXT:    vmov r2, r1, d9
 ; CHECK-NEXT:    movwlt r6, #1
 ; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    movne r6, r0
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r12
 ; CHECK-NEXT:    bl __aeabi_d2lz
 ; CHECK-NEXT:    subs r2, r0, r5
 ; CHECK-NEXT:    vmov.32 d0[0], r6
@@ -2640,9 +2637,9 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NEXT:    subs r3, r0, r7
 ; CHECK-NEXT:    mov r4, #0
 ; CHECK-NEXT:    sbcs r3, r1, #0
-; CHECK-NEXT:    mov r10, #0
-; CHECK-NEXT:    mov r3, #0
 ; CHECK-NEXT:    vmov r9, s18
+; CHECK-NEXT:    mov r3, #0
+; CHECK-NEXT:    mov r10, #0
 ; CHECK-NEXT:    movwlt r3, #1
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    movne r3, r1
@@ -2979,9 +2976,9 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEON-NEXT:    subs r3, r0, r7
 ; CHECK-NEON-NEXT:    mov r4, #0
 ; CHECK-NEON-NEXT:    sbcs r3, r1, #0
-; CHECK-NEON-NEXT:    mov r10, #0
-; CHECK-NEON-NEXT:    mov r3, #0
 ; CHECK-NEON-NEXT:    vmov r8, s18
+; CHECK-NEON-NEXT:    mov r3, #0
+; CHECK-NEON-NEXT:    mov r10, #0
 ; CHECK-NEON-NEXT:    movwlt r3, #1
 ; CHECK-NEON-NEXT:    cmp r3, #0
 ; CHECK-NEON-NEXT:    movne r3, r1
@@ -3260,7 +3257,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s1
 ; CHECK-NEON-NEXT:    vmov.f32 s16, s7
 ; CHECK-NEON-NEXT:    vmov.f32 s18, s6
-; CHECK-NEON-NEXT:    vmov.f32 s20, s5
 ; CHECK-NEON-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEON-NEXT:    vmov.f32 s24, s3
 ; CHECK-NEON-NEXT:    vmov.f32 s26, s2
@@ -3293,12 +3289,12 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s28
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
-; CHECK-NEON-NEXT:    vmov r1, s20
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
 ; CHECK-NEON-NEXT:    vmov s2, r5
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    vmov r0, s0
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s30
+; CHECK-NEON-NEXT:    vmov r1, s20
 ; CHECK-NEON-NEXT:    vmov.32 d8[0], r0
 ; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vmov.32 d12[0], r0
@@ -3503,7 +3499,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s1
 ; CHECK-NEON-NEXT:    vmov.f32 s16, s7
 ; CHECK-NEON-NEXT:    vmov.f32 s18, s6
-; CHECK-NEON-NEXT:    vmov.f32 s20, s5
 ; CHECK-NEON-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEON-NEXT:    vmov.f32 s24, s3
 ; CHECK-NEON-NEXT:    vmov.f32 s26, s2
@@ -3536,12 +3531,12 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s28
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
-; CHECK-NEON-NEXT:    vmov r1, s20
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
 ; CHECK-NEON-NEXT:    vmov s2, r5
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    vmov r0, s0
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s30
+; CHECK-NEON-NEXT:    vmov r1, s20
 ; CHECK-NEON-NEXT:    vmov.32 d8[0], r0
 ; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vmov.32 d12[0], r0
diff --git a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll
index 5a7c4384428e1a..befbb90a1527bc 100644
--- a/llvm/test/CodeGen/ARM/funnel-shift.ll
+++ b/llvm/test/CodeGen/ARM/funnel-shift.ll
@@ -374,11 +374,10 @@ define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) {
 define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
 ; CHECK-LABEL: fshr_i64_const_overshift:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    lsl r2, r0, #23
 ; CHECK-NEXT:    lsl r1, r1, #23
-; CHECK-NEXT:    orr r2, r2, r3, lsr #9
+; CHECK-NEXT:    lsl r2, r0, #23
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #9
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    orr r0, r2, r3, lsr #9
 ; CHECK-NEXT:    bx lr
   %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
   ret i64 %f
diff --git a/llvm/test/CodeGen/ARM/llvm.exp10.ll b/llvm/test/CodeGen/ARM/llvm.exp10.ll
index eb72fe8c1e1b78..0e057c48cb5f11 100644
--- a/llvm/test/CodeGen/ARM/llvm.exp10.ll
+++ b/llvm/test/CodeGen/ARM/llvm.exp10.ll
@@ -279,16 +279,12 @@ define <3 x double> @exp10_v3f64(<3 x double> %x) {
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl exp10
-; CHECK-NEXT:    ldrd r2, r3, [sp, #24]
 ; CHECK-NEXT:    vmov d8, r0, r1
-; CHECK-NEXT:    mov r1, r3
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    ldrd r0, r1, [sp, #24]
 ; CHECK-NEXT:    bl exp10
-; CHECK-NEXT:    ldrd r2, r3, [sp, #32]
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    ldrd r0, r1, [sp, #32]
 ; CHECK-NEXT:    vst1.64 {d8, d9}, [r4:128]!
-; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl exp10
 ; CHECK-NEXT:    strd r0, r1, [r4]
 ; CHECK-NEXT:    vpop {d8, d9}
@@ -309,10 +305,8 @@ define <4 x double> @exp10_v4f64(<4 x double> %x) {
 ; CHECK-NEXT:    add r2, sp, #64
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
-; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    vmov r0, r1, d17
 ; CHECK-NEXT:    vmov r5, r8, d16
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl exp10
 ; CHECK-NEXT:    mov r7, r0
 ; CHECK-NEXT:    mov r6, r1
diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
index 4b6d14efd0ecba..ff8ed66faca4d1 100644
--- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
@@ -308,9 +308,7 @@ define i64 @load_i64_by_i8_bswap(ptr %arg) {
 define i64 @load_i64_by_i8(ptr %arg) {
 ; CHECK-LABEL: load_i64_by_i8:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    ldr r2, [r0]
-; CHECK-NEXT:    ldr r1, [r0, #4]
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    ldm r0, {r0, r1}
 ; CHECK-NEXT:    mov pc, lr
 ;
 ; CHECK-ARMv6-LABEL: load_i64_by_i8:
@@ -320,16 +318,12 @@ define i64 @load_i64_by_i8(ptr %arg) {
 ;
 ; CHECK-THUMBv6-LABEL: load_i64_by_i8:
 ; CHECK-THUMBv6:       @ %bb.0:
-; CHECK-THUMBv6-NEXT:    ldr r2, [r0]
-; CHECK-THUMBv6-NEXT:    ldr r1, [r0, #4]
-; CHECK-THUMBv6-NEXT:    mov r0, r2
+; CHECK-THUMBv6-NEXT:    ldm r0, {r0, r1}
 ; CHECK-THUMBv6-NEXT:    bx lr
 ;
 ; CHECK-THUMBv7-LABEL: load_i64_by_i8:
 ; CHECK-THUMBv7:       @ %bb.0:
-; CHECK-THUMBv7-NEXT:    ldr r2, [r0]
-; CHECK-THUMBv7-NEXT:    ldr r1, [r0, #4]
-; CHECK-THUMBv7-NEXT:    mov r0, r2
+; CHECK-THUMBv7-NEXT:    ldm r0, {r0, r1}
 ; CHECK-THUMBv7-NEXT:    bx lr
 
   %tmp1 = load i8, ptr %arg, align 8
diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll
index 0f6ec8aa473860..b0fa25610fc157 100644
--- a/llvm/test/CodeGen/ARM/load-combine.ll
+++ b/llvm/test/CodeGen/ARM/load-combine.ll
@@ -170,9 +170,7 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) {
 define i64 @load_i64_by_i8(ptr %arg) {
 ; CHECK-LABEL: load_i64_by_i8:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    ldr r2, [r0]
-; CHECK-NEXT:    ldr r1, [r0, #4]
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    ldm r0, {r0, r1}
 ; CHECK-NEXT:    mov pc, lr
 ;
 ; CHECK-ARMv6-LABEL: load_i64_by_i8:
@@ -182,9 +180,7 @@ define i64 @load_i64_by_i8(ptr %arg) {
 ;
 ; CHECK-THUMBv6-LABEL: load_i64_by_i8:
 ; CHECK-THUMBv6:       @ %bb.0:
-; CHECK-THUMBv6-NEXT:    ldr r2, [r0]
-; CHECK-THUMBv6-NEXT:    ldr r1, [r0, #4]
-; CHECK-THUMBv6-NEXT:    mov r0, r2
+; CHECK-THUMBv6-NEXT:    ldm r0, {r0, r1}
 ; CHECK-THUMBv6-NEXT:    bx lr
 ;
 ; CHECK-THUMBv7-LABEL: load_i64_by_i8:
diff --git a/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll b/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll
index 046bbbde686426..b9ca841aa317cf 100644
--- a/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll
+++ b/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -186,9 +186,7 @@ define double @double_sub(i32 %a, i32 %b, double %x, double %y) {
 ; CHECK-V7-NEXT:    movw r1, :lower16:t
 ; CHECK-V7-NEXT:    movt r1, :upper16:t
 ; CHECK-V7-NEXT:    str r0, [r1]
-; CHECK-V7-NEXT:    vmov r2, r3, d16
-; CHECK-V7-NEXT:    mov r0, r2
-; CHECK-V7-NEXT:    mov r1, r3
+; CHECK-V7-NEXT:    vmov r0, r1, d16
 ; CHECK-V7-NEXT:    bx lr
 ;
 ; CHECK-V8-LABEL: double_sub:
@@ -197,13 +195,11 @@ define double @double_sub(i32 %a, i32 %b, double %x, double %y) {
 ; CHECK-V8-NEXT:    cmp r0, r1
 ; CHECK-V8-NEXT:    vmov d17, r2, r3
 ; CHECK-V8-NEXT:    sub r0, r0, r1
-; CHECK-V8-NEXT:    vselgt.f64 d16, d17, d16
 ; CHECK-V8-NEXT:    movw r1, :lower16:t
-; CHECK-V8-NEXT:    vmov r2, r3, d16
+; CHECK-V8-NEXT:    vselgt.f64 d16, d17, d16
 ; CHECK-V8-NEXT:    movt r1, :upper16:t
 ; CHECK-V8-NEXT:    str r0, [r1]
-; CHECK-V8-NEXT:    mov r0, r2
-; CHECK-V8-NEXT:    mov r1, r3
+; CHECK-V8-NEXT:    vmov r0, r1, d16
 ; CHECK-V8-NEXT:    bx lr
 entry:
   %cmp = icmp sgt i32 %a, %b
@@ -224,9 +220,7 @@ define double @double_sub_swap(i32 %a, i32 %b, double %x, double %y) {
 ; CHECK-V7-NEXT:    movw r1, :lower16:t
 ; CHECK-V7-NEXT:    movt r1, :upper16:t
 ; CHECK-V7-NEXT:    str r0, [r1]
-; CHECK-V7-NEXT:    vmov r2, r3, d16
-; CHECK-V7-NEXT:    mov r0, r2
-; CHECK-V7-NEXT:    mov r1, r3
+; CHECK-V7-NEXT:    vmov r0, r1, d16
 ; CHECK-V7-NEXT:    bx lr
 ;
 ; CHECK-V8-LABEL: double_sub_swap:
@@ -235,13 +229,11 @@ define double @double_sub_swap(i32 %a, i32 %b, double %x, double %y) {
 ; CHECK-V8-NEXT:    cmp r1, r0
 ; CHECK-V8-NEXT:    vmov d17, r2, r3
 ; CHECK-V8-NEXT:    sub r0, r1, r0
-; CHECK-V8-NEXT:    vselge.f64 d16, d16, d17
 ; CHECK-V8-NEXT:    movw r1, :lower16:t
-; CHECK-V8-NEXT:    vmov r2, r3, d16
+; CHECK-V8-NEXT:    vselge.f64 d16, d16, d17
 ; CHECK-V8-NEXT:    movt r1, :upper16:t
 ; CHECK-V8-NEXT:    str r0, [r1]
-; CHECK-V8-NEXT:    mov r0, r2
-; CHECK-V8-NEXT:    mov r1, r3
+; CHECK-V8-NEXT:    vmov r0, r1, d16
 ; CHECK-V8-NEXT:    bx lr
 entry:
   %cmp = icmp sgt i32 %a, %b
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
index fe81324d6679bc..c40dd2e9229633 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
@@ -96,15 +96,14 @@ define fp128 @test_v1f128(<1 x fp128> %a, fp128 %s) nounwind {
 ; CHECK-NEXT:    push {r4, r5, r11, lr}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    ldr r12, [sp, #32]
-; CHECK-NEXT:    ldr lr, [sp, #36]
-; CHECK-NEXT:    ldr r4, [sp, #40]
-; CHECK-NEXT:    ldr r5, [sp, #44]
-; CHECK-NEXT:    stm sp, {r0, r1, r2, r3}
-; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    mov r1, lr
-; CHECK-NEXT:    mov r2, r4
-; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    str r1, [sp, #4]
+; CHECK-NEXT:    str r2, [sp, #8]
+; CHECK-NEXT:    str r3, [sp, #12]
+; CHECK-NEXT:    ldr r0, [sp, #32]
+; CHECK-NEXT:    ldr r1, [sp, #36]
+; CHECK-NEXT:    ldr r2, [sp, #40]
+; CHECK-NEXT:    ldr r3, [sp, #44]
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r4, r5, r11, lr}
@@ -194,15 +193,14 @@ define fp128 @test_v2f128(<2 x fp128> %a, fp128 %s) nounwind {
 ; CHECK-NEXT:    push {r4, r5, r11, lr}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    ldr r12, [sp, #48]
-; CHECK-NEXT:    ldr lr, [sp, #52]
-; CHECK-NEXT:    ldr r4, [sp, #56]
-; CHECK-NEXT:    ldr r5, [sp, #60]
-; CHECK-NEXT:    stm sp, {r0, r1, r2, r3}
-; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    mov r1, lr
-; CHECK-NEXT:    mov r2, r4
-; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    str r1, [sp, #4]
+; CHECK-NEXT:    str r2, [sp, #8]
+; CHECK-NEXT:    str r3, [sp, #12]
+; CHECK-NEXT:    ldr r0, [sp, #48]
+; CHECK-NEXT:    ldr r1, [sp, #52]
+; CHECK-NEXT:    ldr r2, [sp, #56]
+; CHECK-NEXT:    ldr r3, [sp, #60]
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    ldr r4, [sp, #32]
 ; CHECK-NEXT:    ldr r5, [sp, #40]
diff --git a/llvm/test/CodeGen/ARM/vlddup.ll b/llvm/test/CodeGen/ARM/vlddup.ll
index c43cb623f585c2..659b2747cd6f64 100644
--- a/llvm/test/CodeGen/ARM/vlddup.ll
+++ b/llvm/test/CodeGen/ARM/vlddup.ll
@@ -21,8 +21,7 @@ define <8 x i8> @vld1dupi8_preinc(ptr noalias nocapture %a, i32 %b) nounwind {
 ; CHECK-NEXT:    add r3, r2, r1
 ; CHECK-NEXT:    str r3, [r0]
 ; CHECK-NEXT:    vld1.8 {d16[]}, [r3]
-; CHECK-NEXT:    vmov r2, r1, d16
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = load ptr, ptr %a, align 4
@@ -40,8 +39,7 @@ define <8 x i8> @vld1dupi8_postinc_fixed(ptr noalias nocapture %a) nounwind {
 ; CHECK-NEXT:    ldr r3, [r0]
 ; CHECK-NEXT:    vld1.8 {d16[]}, [r3]!
 ; CHECK-NEXT:    str r3, [r0]
-; CHECK-NEXT:    vmov r2, r1, d16
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = load ptr, ptr %a, align 4
@@ -59,8 +57,7 @@ define <8 x i8> @vld1dupi8_postinc_register(ptr noalias nocapture %a, i32 %n) no
 ; CHECK-NEXT:    ldr r3, [r0]
 ; CHECK-NEXT:    vld1.8 {d16[]}, [r3], r1
 ; CHECK-NEXT:    str r3, [r0]
-; CHECK-NEXT:    vmov r2, r1, d16
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = load ptr, ptr %a, align 4
@@ -81,9 +78,8 @@ define <16 x i8> @vld1dupqi8_preinc(ptr noalias nocapture %a, i32 %b) nounwind {
 ; CHECK-NEXT:    add lr, r2, r1
 ; CHECK-NEXT:    str lr, [r0]
 ; CHECK-NEXT:    vld1.8 {d16[], d17[]}, [lr]
-; CHECK-NEXT:    vmov r12, r1, d16
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
-; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
 entry:
@@ -104,9 +100,8 @@ define <16 x i8> @vld1dupqi8_postinc_fixed(ptr noalias nocapture %a) nounwind {
 ; CHECK-NEXT:    ldr lr, [r0]
 ; CHECK-NEXT:    vld1.8 {d16[], d17[]}, [lr]!
 ; CHECK-NEXT:    str lr, [r0]
-; CHECK-NEXT:    vmov r12, r1, d16
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
-; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
 entry:
@@ -127,9 +122,8 @@ define <16 x i8> @vld1dupqi8_postinc_register(ptr noalias nocapture %a, i32 %n)
 ; CHECK-NEXT:    ldr lr, [r0]
 ; CHECK-NEXT:    vld1.8 {d16[], d17[]}, [lr], r1
 ; CHECK-NEXT:    str lr, [r0]
-; CHECK-NEXT:    vmov r12, r1, d16
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
-; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
 entry:
@@ -420,8 +414,7 @@ define <4 x i16> @vld2dupi16_update(ptr %ptr) nounwind {
 ; CHECK-NEXT:    vadd.i16 d16, d16, d17
 ; CHECK-NEXT:    str r3, [r0]
 ; CHECK-NEXT:    vdup.16 d16, d16[0]
-; CHECK-NEXT:    vmov r2, r1, d16
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 	%A = load ptr, ptr %ptr
 	%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
@@ -444,8 +437,7 @@ define <4 x i16> @vld2dupi16_odd_update(ptr %ptr) nounwind {
 ; CHECK-NEXT:    vadd.i16 d16, d16, d17
 ; CHECK-NEXT:    str r3, [r0]
 ; CHECK-NEXT:    vdup.16 d16, d16[0]
-; CHECK-NEXT:    vmov r2, r1, d16
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 	%A = load ptr, ptr %ptr
 	%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
@@ -494,8 +486,7 @@ define <8 x i8> @vld3dupi8_update(ptr %ptr, i32 %inc) nounwind {
 ; CHECK-NEXT:    vadd.i8 d16, d20, d18
 ; CHECK-NEXT:    str r3, [r0]
 ; CHECK-NEXT:    vdup.8 d16, d16[0]
-; CHECK-NEXT:    vmov r2, r1, d16
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 	%A = load ptr, ptr %ptr
 	%tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8)
@@ -551,8 +542,7 @@ define <4 x i16> @vld4dupi16_update(ptr %ptr) nounwind {
 ; CHECK-NEXT:    str r3, [r0]
 ; CHECK-NEXT:    vadd.i16 d16, d16, d20
 ; CHECK-NEXT:    vdup.16 d16, d16[0]
-; CHECK-NEXT:    vmov r2, r1, d16
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
 	%A = load ptr, ptr %ptr
 	%tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
diff --git a/llvm/test/CodeGen/ARM/vldlane.ll b/llvm/test/CodeGen/ARM/vldlane.ll
index f6ddc9988877e3..b7d3e5ed8e9d72 100644
--- a/llvm/test/CodeGen/ARM/vldlane.ll
+++ b/llvm/test/CodeGen/ARM/vldlane.ll
@@ -199,8 +199,7 @@ define <2 x i32> @vld2lanei32_update(ptr %ptr, ptr %B) nounwind {
 ; DEFAULT-NEXT:    vld2.32 {d16[1], d17[1]}, [r3]!
 ; DEFAULT-NEXT:    vadd.i32 d16, d16, d17
 ; DEFAULT-NEXT:    str r3, [r0]
-; DEFAULT-NEXT:    vmov r2, r1, d16
-; DEFAULT-NEXT:    mov r0, r2
+; DEFAULT-NEXT:    vmov r0, r1, d16
 ; DEFAULT-NEXT:    mov pc, lr
 ;
 ; BASIC-LABEL: vld2lanei32_update:
@@ -213,9 +212,7 @@ define <2 x i32> @vld2lanei32_update(ptr %ptr, ptr %B) nounwind {
 ; BASIC-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]!
 ; BASIC-NEXT:    vadd.i32 d16, d16, d17
 ; BASIC-NEXT:    str r0, [r1]
-; BASIC-NEXT:    vmov r2, r3, d16
-; BASIC-NEXT:    mov r0, r2
-; BASIC-NEXT:    mov r1, r3
+; BASIC-NEXT:    vmov r0, r1, d16
 ; BASIC-NEXT:    mov pc, lr
   %A = load ptr, ptr %ptr
   %tmp1 = load <2 x i32>, ptr %B
@@ -238,8 +235,7 @@ define <2 x i32> @vld2lanei32_odd_update(ptr %ptr, ptr %B) nounwind {
 ; DEFAULT-NEXT:    vld2.32 {d16[1], d17[1]}, [r3], r1
 ; DEFAULT-NEXT:    vadd.i32 d16, d16, d17
 ; DEFAULT-NEXT:    str r3, [r0]
-; DEFAULT-NEXT:    vmov r2, r1, d16
-; DEFAULT-NEXT:    mov r0, r2
+; DEFAULT-NEXT:    vmov r0, r1, d16
 ; DEFAULT-NEXT:    mov pc, lr
 ;
 ; BASIC-LABEL: vld2lanei32_odd_update:
@@ -253,9 +249,7 @@ define <2 x i32> @vld2lanei32_odd_update(ptr %ptr, ptr %B) nounwind {
 ; BASIC-NEXT:    vld2.32 {d16[1], d17[1]}, [r0], r2
 ; BASIC-NEXT:    vadd.i32 d16, d16, d17
 ; BASIC-NEXT:    str r0, [r1]
-; BASIC-NEXT:    vmov r2, r3, d16
-; BASIC-NEXT:    mov r0, r2
-; BASIC-NEXT:    mov r1, r3
+; BASIC-NEXT:    vmov r0, r1, d16
 ; BASIC-NEXT:    mov pc, lr
   %A = load ptr, ptr %ptr
   %tmp1 = load <2 x i32>, ptr %B
@@ -538,9 +532,8 @@ define <8 x i16> @vld3laneQi16_update(ptr %ptr, ptr %B, i32 %inc) nounwind {
 ; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
 ; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
 ; DEFAULT-NEXT:    str lr, [r0]
-; DEFAULT-NEXT:    vmov r12, r1, d16
+; DEFAULT-NEXT:    vmov r0, r1, d16
 ; DEFAULT-NEXT:    vmov r2, r3, d17
-; DEFAULT-NEXT:    mov r0, r12
 ; DEFAULT-NEXT:    pop {r11, lr}
 ; DEFAULT-NEXT:    mov pc, lr
 ;
@@ -558,11 +551,8 @@ define <8 x i16> @vld3laneQi16_update(ptr %ptr, ptr %B, i32 %inc) nounwind {
 ; BASIC-NEXT:    vadd.i16 q8, q9, q10
 ; BASIC-NEXT:    vadd.i16 q8, q11, q8
 ; BASIC-NEXT:    str r0, [r3]
-; BASIC-NEXT:    vmov r1, lr, d16
-; BASIC-NEXT:    vmov r2, r12, d17
-; BASIC-NEXT:    mov r0, r1
-; BASIC-NEXT:    mov r1, lr
-; BASIC-NEXT:    mov r3, r12
+; BASIC-NEXT:    vmov r0, r1, d16
+; BASIC-NEXT:    vmov r2, r3, d17
 ; BASIC-NEXT:    pop {r11, lr}
 ; BASIC-NEXT:    mov pc, lr
   %A = load ptr, ptr %ptr
@@ -704,8 +694,7 @@ define <8 x i8> @vld4lanei8_update(ptr %ptr, ptr %B) nounwind {
 ; DEFAULT-NEXT:    vadd.i8 d20, d18, d19
 ; DEFAULT-NEXT:    str r3, [r0]
 ; DEFAULT-NEXT:    vadd.i8 d16, d16, d20
-; DEFAULT-NEXT:    vmov r2, r1, d16
-; DEFAULT-NEXT:    mov r0, r2
+; DEFAULT-NEXT:    vmov r0, r1, d16
 ; DEFAULT-NEXT:    mov pc, lr
 ;
 ; BASIC-LABEL: vld4lanei8_update:
@@ -721,9 +710,7 @@ define <8 x i8> @vld4lanei8_update(ptr %ptr, ptr %B) nounwind {
 ; BASIC-NEXT:    vadd.i8 d20, d18, d19
 ; BASIC-NEXT:    str r0, [r3]
 ; BASIC-NEXT:    vadd.i8 d16, d16, d20
-; BASIC-NEXT:    vmov r1, r2, d16
-; BASIC-NEXT:    mov r0, r1
-; BASIC-NEXT:    mov r1, r2
+; BASIC-NEXT:    vmov r0, r1, d16
 ; BASIC-NEXT:    mov pc, lr
   %A = load ptr, ptr %ptr
   %tmp1 = load <8 x i8>, ptr %B
diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll
index f032756e007b68..51589f216a3625 100644
--- a/llvm/test/CodeGen/RISCV/alu64.ll
+++ b/llvm/test/CodeGen/RISCV/alu64.ll
@@ -326,9 +326,8 @@ define i64 @sra(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sra a1, a1, a2
 ; RV32I-NEXT:    bltz a4, .LBB16_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a3, a3, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    srai a1, a3, 31
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB16_2:
 ; RV32I-NEXT:    srl a0, a0, a2
@@ -437,9 +436,8 @@ define i64 @sraiw_i64(i64 %a) nounwind {
 ;
 ; RV32I-LABEL: sraiw_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    srai a2, a0, 9
 ; RV32I-NEXT:    srai a1, a0, 31
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    srai a0, a0, 9
 ; RV32I-NEXT:    ret
   %1 = shl i64 %a, 32
   %2 = ashr i64 %1, 41
diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
index 02aeebdeb37757..b77809308ad9c7 100644
--- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll
+++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
@@ -129,9 +129,8 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
 ; RV32-NEXT:    lw a3, 0(a1)
 ; RV32-NEXT:    addi a4, a1, 4
 ; RV32-NEXT:    slli a3, a3, 1
-; RV32-NEXT:    addi a1, a0, 4
 ; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    addi a0, a0, 4
 ; RV32-NEXT:    mv a1, a4
 ; RV32-NEXT:    bne a4, a2, .LBB3_2
 ; RV32-NEXT:  .LBB3_3: # %while.end
@@ -153,9 +152,8 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
 ; RV64-NEXT:    lw a3, 0(a1)
 ; RV64-NEXT:    addi a4, a1, 4
 ; RV64-NEXT:    slli a3, a3, 1
-; RV64-NEXT:    addi a1, a0, 4
 ; RV64-NEXT:    sw a3, 0(a0)
-; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    addi a0, a0, 4
 ; RV64-NEXT:    mv a1, a4
 ; RV64-NEXT:    bne a4, a2, .LBB3_2
 ; RV64-NEXT:  .LBB3_3: # %while.end
diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll
index 622365cf13bcef..650b57acafc2b0 100644
--- a/llvm/test/CodeGen/RISCV/condops.ll
+++ b/llvm/test/CodeGen/RISCV/condops.ll
@@ -455,9 +455,8 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-NEXT:    sltu a5, a1, a3
 ; RV32I-NEXT:    and a0, a0, a4
 ; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    sub a2, a2, a5
 ; RV32I-NEXT:    sub a0, a1, a3
-; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    sub a1, a2, a5
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sub1:
@@ -473,9 +472,8 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-NEXT:    sltu a5, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a0
 ; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a5
 ; RV32XVENTANACONDOPS-NEXT:    sub a0, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    mv a1, a2
+; RV32XVENTANACONDOPS-NEXT:    sub a1, a2, a5
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: sub1:
@@ -496,9 +494,8 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-NEXT:    sltu a5, a1, a3
 ; RV32ZICOND-NEXT:    czero.eqz a0, a4, a0
 ; RV32ZICOND-NEXT:    sub a2, a2, a0
-; RV32ZICOND-NEXT:    sub a2, a2, a5
 ; RV32ZICOND-NEXT:    sub a0, a1, a3
-; RV32ZICOND-NEXT:    mv a1, a2
+; RV32ZICOND-NEXT:    sub a1, a2, a5
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: sub1:
@@ -519,9 +516,8 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-NEXT:    sltu a5, a1, a3
 ; RV32I-NEXT:    and a0, a0, a4
 ; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    sub a2, a2, a5
 ; RV32I-NEXT:    sub a0, a1, a3
-; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    sub a1, a2, a5
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sub2:
@@ -537,9 +533,8 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-NEXT:    sltu a5, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a0
 ; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a5
 ; RV32XVENTANACONDOPS-NEXT:    sub a0, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    mv a1, a2
+; RV32XVENTANACONDOPS-NEXT:    sub a1, a2, a5
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: sub2:
@@ -560,9 +555,8 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-NEXT:    sltu a5, a1, a3
 ; RV32ZICOND-NEXT:    czero.nez a0, a4, a0
 ; RV32ZICOND-NEXT:    sub a2, a2, a0
-; RV32ZICOND-NEXT:    sub a2, a2, a5
 ; RV32ZICOND-NEXT:    sub a0, a1, a3
-; RV32ZICOND-NEXT:    mv a1, a2
+; RV32ZICOND-NEXT:    sub a1, a2, a5
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: sub2:
diff --git a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
index e864d8fb0eddd5..8f88fbf4c05867 100644
--- a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
@@ -288,9 +288,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    csrr a4, fflags
 ; RV32IZFINXZDINX-NEXT:    flt.d a6, a2, a0
 ; RV32IZFINXZDINX-NEXT:    csrw fflags, a4
-; RV32IZFINXZDINX-NEXT:    or a4, a6, a5
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a2, a0
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a6, a5
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_one:
@@ -302,9 +301,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    csrr a2, fflags
 ; RV64IZFINXZDINX-NEXT:    flt.d a4, a1, a0
 ; RV64IZFINXZDINX-NEXT:    csrw fflags, a2
-; RV64IZFINXZDINX-NEXT:    or a2, a4, a3
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a1, a0
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    or a0, a4, a3
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_one:
@@ -438,9 +436,8 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    flt.d a6, a2, a0
 ; RV32IZFINXZDINX-NEXT:    csrw fflags, a4
 ; RV32IZFINXZDINX-NEXT:    or a4, a6, a5
-; RV32IZFINXZDINX-NEXT:    xori a4, a4, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a2, a0
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    xori a0, a4, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_ueq:
@@ -453,9 +450,8 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    flt.d a4, a1, a0
 ; RV64IZFINXZDINX-NEXT:    csrw fflags, a2
 ; RV64IZFINXZDINX-NEXT:    or a3, a4, a3
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a1, a0
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ueq:
@@ -531,9 +527,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    csrr a4, fflags
 ; RV32IZFINXZDINX-NEXT:    fle.d a5, a0, a2
 ; RV32IZFINXZDINX-NEXT:    csrw fflags, a4
-; RV32IZFINXZDINX-NEXT:    xori a4, a5, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a0, a2
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    xori a0, a5, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_ugt:
@@ -541,9 +536,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    csrr a2, fflags
 ; RV64IZFINXZDINX-NEXT:    fle.d a3, a0, a1
 ; RV64IZFINXZDINX-NEXT:    csrw fflags, a2
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a0, a1
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ugt:
@@ -585,9 +579,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    csrr a4, fflags
 ; RV32IZFINXZDINX-NEXT:    flt.d a5, a0, a2
 ; RV32IZFINXZDINX-NEXT:    csrw fflags, a4
-; RV32IZFINXZDINX-NEXT:    xori a4, a5, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a0, a2
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    xori a0, a5, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_uge:
@@ -595,9 +588,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    csrr a2, fflags
 ; RV64IZFINXZDINX-NEXT:    flt.d a3, a0, a1
 ; RV64IZFINXZDINX-NEXT:    csrw fflags, a2
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a0, a1
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_uge:
@@ -641,9 +633,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    csrr a4, fflags
 ; RV32IZFINXZDINX-NEXT:    fle.d a5, a2, a0
 ; RV32IZFINXZDINX-NEXT:    csrw fflags, a4
-; RV32IZFINXZDINX-NEXT:    xori a4, a5, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a2, a0
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    xori a0, a5, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_ult:
@@ -651,9 +642,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    csrr a2, fflags
 ; RV64IZFINXZDINX-NEXT:    fle.d a3, a1, a0
 ; RV64IZFINXZDINX-NEXT:    csrw fflags, a2
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a1, a0
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ult:
@@ -695,9 +685,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    csrr a4, fflags
 ; RV32IZFINXZDINX-NEXT:    flt.d a5, a2, a0
 ; RV32IZFINXZDINX-NEXT:    csrw fflags, a4
-; RV32IZFINXZDINX-NEXT:    xori a4, a5, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a2, a0
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    xori a0, a5, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_ule:
@@ -705,9 +694,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    csrr a2, fflags
 ; RV64IZFINXZDINX-NEXT:    flt.d a3, a1, a0
 ; RV64IZFINXZDINX-NEXT:    csrw fflags, a2
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a1, a0
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ule:
diff --git a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
index dae9f3e089cf4a..6c75edb479252c 100644
--- a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
@@ -247,9 +247,8 @@ define i32 @fcmp_one(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    csrr a2, fflags
 ; CHECKIZFINX-NEXT:    flt.s a4, a1, a0
 ; CHECKIZFINX-NEXT:    csrw fflags, a2
-; CHECKIZFINX-NEXT:    or a2, a4, a3
 ; CHECKIZFINX-NEXT:    feq.s zero, a1, a0
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    or a0, a4, a3
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_one:
@@ -368,9 +367,8 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    flt.s a4, a1, a0
 ; CHECKIZFINX-NEXT:    csrw fflags, a2
 ; CHECKIZFINX-NEXT:    or a3, a4, a3
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a1, a0
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ueq:
@@ -438,9 +436,8 @@ define i32 @fcmp_ugt(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    csrr a2, fflags
 ; CHECKIZFINX-NEXT:    fle.s a3, a0, a1
 ; CHECKIZFINX-NEXT:    csrw fflags, a2
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a0, a1
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ugt:
@@ -482,9 +479,8 @@ define i32 @fcmp_uge(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    csrr a2, fflags
 ; CHECKIZFINX-NEXT:    flt.s a3, a0, a1
 ; CHECKIZFINX-NEXT:    csrw fflags, a2
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a0, a1
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_uge:
@@ -528,9 +524,8 @@ define i32 @fcmp_ult(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    csrr a2, fflags
 ; CHECKIZFINX-NEXT:    fle.s a3, a1, a0
 ; CHECKIZFINX-NEXT:    csrw fflags, a2
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a1, a0
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ult:
@@ -572,9 +567,8 @@ define i32 @fcmp_ule(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    csrr a2, fflags
 ; CHECKIZFINX-NEXT:    flt.s a3, a1, a0
 ; CHECKIZFINX-NEXT:    csrw fflags, a2
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a1, a0
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ule:
diff --git a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
index d96c39c504e1fd..f4f1279a8e18ed 100644
--- a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
@@ -235,9 +235,8 @@ define i32 @fcmp_one(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    csrr a2, fflags
 ; CHECKIZHINX-NEXT:    flt.h a4, a1, a0
 ; CHECKIZHINX-NEXT:    csrw fflags, a2
-; CHECKIZHINX-NEXT:    or a2, a4, a3
 ; CHECKIZHINX-NEXT:    feq.h zero, a1, a0
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    or a0, a4, a3
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_one:
@@ -334,9 +333,8 @@ define i32 @fcmp_ueq(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    flt.h a4, a1, a0
 ; CHECKIZHINX-NEXT:    csrw fflags, a2
 ; CHECKIZHINX-NEXT:    or a3, a4, a3
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a1, a0
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_ueq:
@@ -388,9 +386,8 @@ define i32 @fcmp_ugt(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    csrr a2, fflags
 ; CHECKIZHINX-NEXT:    fle.h a3, a0, a1
 ; CHECKIZHINX-NEXT:    csrw fflags, a2
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a0, a1
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_ugt:
@@ -432,9 +429,8 @@ define i32 @fcmp_uge(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    csrr a2, fflags
 ; CHECKIZHINX-NEXT:    flt.h a3, a0, a1
 ; CHECKIZHINX-NEXT:    csrw fflags, a2
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a0, a1
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_uge:
@@ -476,9 +472,8 @@ define i32 @fcmp_ult(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    csrr a2, fflags
 ; CHECKIZHINX-NEXT:    fle.h a3, a1, a0
 ; CHECKIZHINX-NEXT:    csrw fflags, a2
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a1, a0
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_ult:
@@ -520,9 +515,8 @@ define i32 @fcmp_ule(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    csrr a2, fflags
 ; CHECKIZHINX-NEXT:    flt.h a3, a1, a0
 ; CHECKIZHINX-NEXT:    csrw fflags, a2
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a1, a0
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_ule:
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index 30f9dd1e516585..23c885a1d2cb62 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -641,8 +641,8 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32IZFINXZDINX-NEXT:    mv s0, a4
 ; RV32IZFINXZDINX-NEXT:    mv s1, a3
 ; RV32IZFINXZDINX-NEXT:    mv s2, a2
-; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:    mv s3, a0
+; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:    addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT:    mv a0, a2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
@@ -691,8 +691,8 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64IZFINXZDINX-NEXT:    mv s0, a4
 ; RV64IZFINXZDINX-NEXT:    mv s1, a3
 ; RV64IZFINXZDINX-NEXT:    mv s2, a2
-; RV64IZFINXZDINX-NEXT:    mv a2, a1
 ; RV64IZFINXZDINX-NEXT:    mv s3, a0
+; RV64IZFINXZDINX-NEXT:    mv a2, a1
 ; RV64IZFINXZDINX-NEXT:    mv a1, sp
 ; RV64IZFINXZDINX-NEXT:    mv a0, a2
 ; RV64IZFINXZDINX-NEXT:    call frexpf
@@ -741,10 +741,9 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32I-NEXT:    lw s0, 12(a1)
 ; RV32I-NEXT:    lw s1, 8(a1)
 ; RV32I-NEXT:    lw s2, 4(a1)
-; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    lw a0, 0(a1)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    addi a1, sp, 12
@@ -791,10 +790,9 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64I-NEXT:    lw s0, 24(a1)
 ; RV64I-NEXT:    lw s1, 16(a1)
 ; RV64I-NEXT:    lw s2, 8(a1)
-; RV64I-NEXT:    lw a2, 0(a1)
 ; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    lw a0, 0(a1)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    addi a1, sp, 4
@@ -925,8 +923,8 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV32IZFINXZDINX-NEXT:    mv s0, a4
 ; RV32IZFINXZDINX-NEXT:    mv s1, a3
 ; RV32IZFINXZDINX-NEXT:    mv s2, a2
-; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:    mv s3, a0
+; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:    addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT:    mv a0, a2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
@@ -967,8 +965,8 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV64IZFINXZDINX-NEXT:    mv s0, a4
 ; RV64IZFINXZDINX-NEXT:    mv s1, a3
 ; RV64IZFINXZDINX-NEXT:    mv s2, a2
-; RV64IZFINXZDINX-NEXT:    mv a2, a1
 ; RV64IZFINXZDINX-NEXT:    mv s3, a0
+; RV64IZFINXZDINX-NEXT:    mv a2, a1
 ; RV64IZFINXZDINX-NEXT:    mv a1, sp
 ; RV64IZFINXZDINX-NEXT:    mv a0, a2
 ; RV64IZFINXZDINX-NEXT:    call frexpf
@@ -1009,10 +1007,9 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV32I-NEXT:    lw s0, 12(a1)
 ; RV32I-NEXT:    lw s1, 8(a1)
 ; RV32I-NEXT:    lw s2, 4(a1)
-; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    lw a0, 0(a1)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    addi a1, sp, 12
@@ -1051,10 +1048,9 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV64I-NEXT:    lw s0, 24(a1)
 ; RV64I-NEXT:    lw s1, 16(a1)
 ; RV64I-NEXT:    lw s2, 8(a1)
-; RV64I-NEXT:    lw a2, 0(a1)
 ; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    lw a0, 0(a1)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    addi a1, sp, 4
@@ -1175,8 +1171,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s0, a4
 ; RV32IZFINXZDINX-NEXT:    mv s1, a3
 ; RV32IZFINXZDINX-NEXT:    mv s2, a2
-; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:    mv s3, a0
+; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:    addi a1, sp, 12
 ; RV32IZFINXZDINX-NEXT:    mv a0, a2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
@@ -1216,8 +1212,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV64IZFINXZDINX-NEXT:    mv s0, a4
 ; RV64IZFINXZDINX-NEXT:    mv s1, a3
 ; RV64IZFINXZDINX-NEXT:    mv s2, a2
-; RV64IZFINXZDINX-NEXT:    mv a2, a1
 ; RV64IZFINXZDINX-NEXT:    mv s3, a0
+; RV64IZFINXZDINX-NEXT:    mv a2, a1
 ; RV64IZFINXZDINX-NEXT:    addi a1, sp, 8
 ; RV64IZFINXZDINX-NEXT:    mv a0, a2
 ; RV64IZFINXZDINX-NEXT:    call frexpf
@@ -1257,10 +1253,9 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32I-NEXT:    lw s0, 12(a1)
 ; RV32I-NEXT:    lw s1, 8(a1)
 ; RV32I-NEXT:    lw s2, 4(a1)
-; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    lw a0, 0(a1)
 ; RV32I-NEXT:    addi a1, sp, 12
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    addi a1, sp, 16
 ; RV32I-NEXT:    mv a0, s2
@@ -1298,10 +1293,9 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV64I-NEXT:    lw s0, 24(a1)
 ; RV64I-NEXT:    lw s1, 16(a1)
 ; RV64I-NEXT:    lw s2, 8(a1)
-; RV64I-NEXT:    lw a2, 0(a1)
 ; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    lw a0, 0(a1)
 ; RV64I-NEXT:    addi a1, sp, 8
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    mv a0, s2
diff --git a/llvm/test/CodeGen/RISCV/machine-cp.mir b/llvm/test/CodeGen/RISCV/machine-cp.mir
index f3674f89cd918b..ce3d71f55e0651 100644
--- a/llvm/test/CodeGen/RISCV/machine-cp.mir
+++ b/llvm/test/CodeGen/RISCV/machine-cp.mir
@@ -19,14 +19,15 @@ body:             |
     ; RV32: liveins: $v28_v29_v30, $v8_v9, $v1
     ; RV32-NEXT: {{  $}}
     ; RV32-NEXT: renamable $v4_v5_v6_v7_v8_v9_v10_v11 = COPY killed renamable $v0_v1_v2_v3_v4_v5_v6_v7
-    ; RV32-NEXT: renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
-    ; RV32-NEXT: PseudoRET implicit $v28
+    ; RV32-NEXT: renamable $v28 = COPY killed renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
+    ; RV32-NEXT: PseudoRET implicit killed $v28
+    ;
     ; RV64-LABEL: name: foo
     ; RV64: liveins: $v28_v29_v30, $v8_v9, $v1
     ; RV64-NEXT: {{  $}}
     ; RV64-NEXT: renamable $v4_v5_v6_v7_v8_v9_v10_v11 = COPY killed renamable $v0_v1_v2_v3_v4_v5_v6_v7
-    ; RV64-NEXT: renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
-    ; RV64-NEXT: PseudoRET implicit $v28
+    ; RV64-NEXT: renamable $v28 = COPY killed renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
+    ; RV64-NEXT: PseudoRET implicit killed $v28
     renamable $v8 = COPY renamable $v1, implicit killed $v8_v9, implicit-def $v8_v9
     renamable $v4_v5_v6_v7_v8_v9_v10_v11 = COPY killed renamable $v0_v1_v2_v3_v4_v5_v6_v7
     renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index 6f301882b452c0..48f725dba5dccb 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -211,10 +211,9 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
 ; RV32I-NEXT:    sw a0, 0(a2)
 ; RV32I-NEXT:    snez a3, a0
 ; RV32I-NEXT:    neg a4, a1
-; RV32I-NEXT:    sub a3, a4, a3
-; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    sw a1, 4(a2)
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    sub a1, a4, a3
+; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: neg_abs64_multiuse:
@@ -229,10 +228,9 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
 ; RV32ZBB-NEXT:    sw a0, 0(a2)
 ; RV32ZBB-NEXT:    snez a3, a0
 ; RV32ZBB-NEXT:    neg a4, a1
-; RV32ZBB-NEXT:    sub a3, a4, a3
-; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:    sw a1, 4(a2)
-; RV32ZBB-NEXT:    mv a1, a3
+; RV32ZBB-NEXT:    sub a1, a4, a3
+; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: neg_abs64_multiuse:
diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
index 4c5c36fc72d14d..4d2aa667a402fb 100644
--- a/llvm/test/CodeGen/RISCV/nontemporal.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -16,10 +16,9 @@ define i64 @test_nontemporal_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_load_i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    lw a2, 0(a0)
-; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    lw a1, 4(a0)
-; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_load_i64:
@@ -31,10 +30,9 @@ define i64 @test_nontemporal_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_load_i64:
 ; CHECK-RV32C:       # %bb.0:
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    lw a2, 0(a0)
-; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    lw a1, 4(a0)
-; CHECK-RV32C-NEXT:    mv a0, a2
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_load_i64:
@@ -46,10 +44,9 @@ define i64 @test_nontemporal_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_load_i64:
 ; CHECK-RV32V:       # %bb.0:
 ; CHECK-RV32V-NEXT:    ntl.all
-; CHECK-RV32V-NEXT:    lw a2, 0(a0)
-; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    lw a1, 4(a0)
-; CHECK-RV32V-NEXT:    mv a0, a2
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32V-NEXT:    ret
 
   %1 = load i64, ptr %p, !nontemporal !0
@@ -540,10 +537,9 @@ define <2 x i64> @test_nontemporal_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_load_v2i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    ld a2, 0(a0)
-; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    ld a1, 8(a0)
-; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_load_v2i64:
@@ -565,10 +561,9 @@ define <2 x i64> @test_nontemporal_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_load_v2i64:
 ; CHECK-RV64C:       # %bb.0:
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    ld a2, 0(a0)
-; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    ld a1, 8(a0)
-; CHECK-RV64C-NEXT:    mv a0, a2
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_load_v2i64:
@@ -1448,10 +1443,9 @@ define i64 @test_nontemporal_P1_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_P1_load_i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    lw a2, 0(a0)
-; CHECK-RV32-NEXT:    ntl.p1
 ; CHECK-RV32-NEXT:    lw a1, 4(a0)
-; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i64:
@@ -1463,10 +1457,9 @@ define i64 @test_nontemporal_P1_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i64:
 ; CHECK-RV32C:       # %bb.0:
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    lw a2, 0(a0)
-; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    lw a1, 4(a0)
-; CHECK-RV32C-NEXT:    mv a0, a2
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i64:
@@ -1478,10 +1471,9 @@ define i64 @test_nontemporal_P1_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i64:
 ; CHECK-RV32V:       # %bb.0:
 ; CHECK-RV32V-NEXT:    ntl.p1
-; CHECK-RV32V-NEXT:    lw a2, 0(a0)
-; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    lw a1, 4(a0)
-; CHECK-RV32V-NEXT:    mv a0, a2
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32V-NEXT:    ret
   %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
   ret i64 %1
@@ -1962,10 +1954,9 @@ define <2 x i64> @test_nontemporal_P1_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_P1_load_v2i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    ld a2, 0(a0)
-; CHECK-RV64-NEXT:    ntl.p1
 ; CHECK-RV64-NEXT:    ld a1, 8(a0)
-; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_P1_load_v2i64:
@@ -1987,10 +1978,9 @@ define <2 x i64> @test_nontemporal_P1_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v2i64:
 ; CHECK-RV64C:       # %bb.0:
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    ld a2, 0(a0)
-; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    ld a1, 8(a0)
-; CHECK-RV64C-NEXT:    mv a0, a2
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v2i64:
@@ -2862,10 +2852,9 @@ define i64 @test_nontemporal_PALL_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    lw a2, 0(a0)
-; CHECK-RV32-NEXT:    ntl.pall
 ; CHECK-RV32-NEXT:    lw a1, 4(a0)
-; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i64:
@@ -2877,10 +2866,9 @@ define i64 @test_nontemporal_PALL_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i64:
 ; CHECK-RV32C:       # %bb.0:
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    lw a2, 0(a0)
-; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    lw a1, 4(a0)
-; CHECK-RV32C-NEXT:    mv a0, a2
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i64:
@@ -2892,10 +2880,9 @@ define i64 @test_nontemporal_PALL_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i64:
 ; CHECK-RV32V:       # %bb.0:
 ; CHECK-RV32V-NEXT:    ntl.pall
-; CHECK-RV32V-NEXT:    lw a2, 0(a0)
-; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    lw a1, 4(a0)
-; CHECK-RV32V-NEXT:    mv a0, a2
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32V-NEXT:    ret
   %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
   ret i64 %1
@@ -3376,10 +3363,9 @@ define <2 x i64> @test_nontemporal_PALL_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v2i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    ld a2, 0(a0)
-; CHECK-RV64-NEXT:    ntl.pall
 ; CHECK-RV64-NEXT:    ld a1, 8(a0)
-; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v2i64:
@@ -3401,10 +3387,9 @@ define <2 x i64> @test_nontemporal_PALL_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v2i64:
 ; CHECK-RV64C:       # %bb.0:
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    ld a2, 0(a0)
-; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    ld a1, 8(a0)
-; CHECK-RV64C-NEXT:    mv a0, a2
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v2i64:
@@ -4276,10 +4261,9 @@ define i64 @test_nontemporal_S1_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_S1_load_i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    lw a2, 0(a0)
-; CHECK-RV32-NEXT:    ntl.s1
 ; CHECK-RV32-NEXT:    lw a1, 4(a0)
-; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i64:
@@ -4291,10 +4275,9 @@ define i64 @test_nontemporal_S1_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i64:
 ; CHECK-RV32C:       # %bb.0:
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    lw a2, 0(a0)
-; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    lw a1, 4(a0)
-; CHECK-RV32C-NEXT:    mv a0, a2
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i64:
@@ -4306,10 +4289,9 @@ define i64 @test_nontemporal_S1_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i64:
 ; CHECK-RV32V:       # %bb.0:
 ; CHECK-RV32V-NEXT:    ntl.s1
-; CHECK-RV32V-NEXT:    lw a2, 0(a0)
-; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    lw a1, 4(a0)
-; CHECK-RV32V-NEXT:    mv a0, a2
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32V-NEXT:    ret
   %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
   ret i64 %1
@@ -4790,10 +4772,9 @@ define <2 x i64> @test_nontemporal_S1_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_S1_load_v2i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    ld a2, 0(a0)
-; CHECK-RV64-NEXT:    ntl.s1
 ; CHECK-RV64-NEXT:    ld a1, 8(a0)
-; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_S1_load_v2i64:
@@ -4815,10 +4796,9 @@ define <2 x i64> @test_nontemporal_S1_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v2i64:
 ; CHECK-RV64C:       # %bb.0:
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    ld a2, 0(a0)
-; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    ld a1, 8(a0)
-; CHECK-RV64C-NEXT:    mv a0, a2
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v2i64:
@@ -5690,10 +5670,9 @@ define i64 @test_nontemporal_ALL_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i64:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    lw a2, 0(a0)
-; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    lw a1, 4(a0)
-; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i64:
@@ -5705,10 +5684,9 @@ define i64 @test_nontemporal_ALL_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i64:
 ; CHECK-RV32C:       # %bb.0:
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    lw a2, 0(a0)
-; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    lw a1, 4(a0)
-; CHECK-RV32C-NEXT:    mv a0, a2
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i64:
@@ -5720,10 +5698,9 @@ define i64 @test_nontemporal_ALL_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i64:
 ; CHECK-RV32V:       # %bb.0:
 ; CHECK-RV32V-NEXT:    ntl.all
-; CHECK-RV32V-NEXT:    lw a2, 0(a0)
-; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    lw a1, 4(a0)
-; CHECK-RV32V-NEXT:    mv a0, a2
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    lw a0, 0(a0)
 ; CHECK-RV32V-NEXT:    ret
   %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
   ret i64 %1
@@ -6204,10 +6181,9 @@ define <2 x i64> @test_nontemporal_ALL_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v2i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    ld a2, 0(a0)
-; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    ld a1, 8(a0)
-; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v2i64:
@@ -6229,10 +6205,9 @@ define <2 x i64> @test_nontemporal_ALL_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v2i64:
 ; CHECK-RV64C:       # %bb.0:
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    ld a2, 0(a0)
-; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    ld a1, 8(a0)
-; CHECK-RV64C-NEXT:    mv a0, a2
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a0, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v2i64:
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 4bb65f376218f1..67143336de477b 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -686,10 +686,10 @@ define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) {
 ; RV32-LABEL: uaddo_i64_decrement_alt:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    or a3, a0, a1
-; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    seqz a4, a0
 ; RV32-NEXT:    sub a1, a1, a4
 ; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    sw a1, 4(a2)
 ; RV32-NEXT:    mv a0, a3
@@ -714,10 +714,10 @@ define i1 @uaddo_i64_decrement_alt_dom(i64 %x, ptr %p) {
 ; RV32-LABEL: uaddo_i64_decrement_alt_dom:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    or a3, a0, a1
-; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    seqz a4, a0
 ; RV32-NEXT:    sub a1, a1, a4
 ; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    sw a0, 0(a2)
 ; RV32-NEXT:    sw a1, 4(a2)
 ; RV32-NEXT:    mv a0, a3
@@ -830,10 +830,9 @@ define i1 @usubo_ugt_i32(i32 %x, i32 %y, ptr %p) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    sext.w a3, a1
 ; RV64-NEXT:    sext.w a4, a0
-; RV64-NEXT:    sltu a3, a4, a3
 ; RV64-NEXT:    subw a0, a0, a1
 ; RV64-NEXT:    sw a0, 0(a2)
-; RV64-NEXT:    mv a0, a3
+; RV64-NEXT:    sltu a0, a4, a3
 ; RV64-NEXT:    ret
   %ov = icmp ugt i32 %y, %x
   %s = sub i32 %x, %y
@@ -929,19 +928,17 @@ define i1 @usubo_ugt_constant_op1_i8(i8 %x, ptr %p) {
 ; RV32-LABEL: usubo_ugt_constant_op1_i8:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    andi a2, a0, 255
-; RV32-NEXT:    sltiu a2, a2, 45
 ; RV32-NEXT:    addi a0, a0, -45
 ; RV32-NEXT:    sb a0, 0(a1)
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    sltiu a0, a2, 45
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: usubo_ugt_constant_op1_i8:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    andi a2, a0, 255
-; RV64-NEXT:    sltiu a2, a2, 45
 ; RV64-NEXT:    addi a0, a0, -45
 ; RV64-NEXT:    sb a0, 0(a1)
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    sltiu a0, a2, 45
 ; RV64-NEXT:    ret
   %ov = icmp ugt i8 45, %x
   %s = add i8 %x, -45
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
index 4e958f5699adbf..5fe5a0eda46b84 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
@@ -153,10 +153,10 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    not a6, a2
 ; CHECK-NEXT:    srl a3, a1, a6
-; CHECK-NEXT:    or a3, a5, a3
 ; CHECK-NEXT:    sll a0, a0, a2
 ; CHECK-NEXT:    srli a4, a4, 1
 ; CHECK-NEXT:    srl a1, a4, a6
+; CHECK-NEXT:    or a3, a5, a3
 ; CHECK-NEXT:    or a1, a0, a1
 ; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:    ret
@@ -252,11 +252,10 @@ define i64 @rori_i64(i64 %a) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    srli a2, a0, 1
 ; CHECK-NEXT:    slli a3, a1, 31
-; CHECK-NEXT:    or a2, a3, a2
 ; CHECK-NEXT:    srli a1, a1, 1
 ; CHECK-NEXT:    slli a0, a0, 31
 ; CHECK-NEXT:    or a1, a0, a1
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    or a0, a3, a2
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63)
   ret i64 %1
@@ -267,11 +266,10 @@ define i64 @rori_i64_fshr(i64 %a) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    srli a2, a1, 31
 ; CHECK-NEXT:    slli a3, a0, 1
-; CHECK-NEXT:    or a2, a3, a2
 ; CHECK-NEXT:    srli a0, a0, 31
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    or a1, a1, a0
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    or a0, a3, a2
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 63)
   ret i64 %1
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll
new file mode 100644
index 00000000000000..15aa11670e1263
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll
@@ -0,0 +1,2603 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m -verify-machineinstrs \
+; RUN:   -riscv-experimental-rv64-legal-i32 | FileCheck %s -check-prefix=RV64
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba -verify-machineinstrs \
+; RUN:   -riscv-experimental-rv64-legal-i32 | FileCheck %s --check-prefix=RV64ZBA
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zicond -verify-machineinstrs \
+; RUN:   -riscv-experimental-rv64-legal-i32 | FileCheck %s --check-prefix=RV64ZICOND
+
+;
+; Get the actual value of the overflow bit.
+;
+define zeroext i1 @saddo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: saddo1.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addw a3, a0, a1
+; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    xor a3, a1, a3
+; RV64-NEXT:    snez a0, a3
+; RV64-NEXT:    sw a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo1.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addw a3, a0, a1
+; RV64ZBA-NEXT:    add a1, a0, a1
+; RV64ZBA-NEXT:    xor a3, a1, a3
+; RV64ZBA-NEXT:    snez a0, a3
+; RV64ZBA-NEXT:    sw a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo1.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addw a3, a0, a1
+; RV64ZICOND-NEXT:    add a1, a0, a1
+; RV64ZICOND-NEXT:    xor a3, a1, a3
+; RV64ZICOND-NEXT:    snez a0, a3
+; RV64ZICOND-NEXT:    sw a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+; Test the immediate version.
+define zeroext i1 @saddo2.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: saddo2.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addiw a2, a0, 4
+; RV64-NEXT:    slt a0, a2, a0
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo2.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addiw a2, a0, 4
+; RV64ZBA-NEXT:    slt a0, a2, a0
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo2.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addiw a2, a0, 4
+; RV64ZICOND-NEXT:    slt a0, a2, a0
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 4)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+; Test negative immediates.
+define zeroext i1 @saddo3.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: saddo3.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addiw a2, a0, -4
+; RV64-NEXT:    slt a0, a2, a0
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo3.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addiw a2, a0, -4
+; RV64ZBA-NEXT:    slt a0, a2, a0
+; RV64ZBA-NEXT:    xori a0, a0, 1
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo3.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addiw a2, a0, -4
+; RV64ZICOND-NEXT:    slt a0, a2, a0
+; RV64ZICOND-NEXT:    xori a0, a0, 1
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 -4)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+; Test immediates that are too large to be encoded.
+define zeroext i1 @saddo4.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: saddo4.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    lui a2, 4096
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    addw a2, a0, a2
+; RV64-NEXT:    slt a0, a2, a0
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo4.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    lui a2, 4096
+; RV64ZBA-NEXT:    addi a2, a2, -1
+; RV64ZBA-NEXT:    addw a2, a0, a2
+; RV64ZBA-NEXT:    slt a0, a2, a0
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo4.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    lui a2, 4096
+; RV64ZICOND-NEXT:    addi a2, a2, -1
+; RV64ZICOND-NEXT:    addw a2, a0, a2
+; RV64ZICOND-NEXT:    slt a0, a2, a0
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 16777215)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: saddo1.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    add a3, a0, a1
+; RV64-NEXT:    slt a0, a3, a0
+; RV64-NEXT:    slti a1, a1, 0
+; RV64-NEXT:    xor a0, a1, a0
+; RV64-NEXT:    sd a3, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo1.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    add a3, a0, a1
+; RV64ZBA-NEXT:    slt a0, a3, a0
+; RV64ZBA-NEXT:    slti a1, a1, 0
+; RV64ZBA-NEXT:    xor a0, a1, a0
+; RV64ZBA-NEXT:    sd a3, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo1.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    add a3, a0, a1
+; RV64ZICOND-NEXT:    slt a0, a3, a0
+; RV64ZICOND-NEXT:    slti a1, a1, 0
+; RV64ZICOND-NEXT:    xor a0, a1, a0
+; RV64ZICOND-NEXT:    sd a3, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
+; RV64-LABEL: saddo2.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addi a2, a0, 4
+; RV64-NEXT:    slt a0, a2, a0
+; RV64-NEXT:    sd a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo2.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addi a2, a0, 4
+; RV64ZBA-NEXT:    slt a0, a2, a0
+; RV64ZBA-NEXT:    sd a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo2.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addi a2, a0, 4
+; RV64ZICOND-NEXT:    slt a0, a2, a0
+; RV64ZICOND-NEXT:    sd a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 4)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo3.i64(i64 %v1, ptr %res) {
+; RV64-LABEL: saddo3.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addi a2, a0, -4
+; RV64-NEXT:    slt a0, a2, a0
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    sd a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo3.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addi a2, a0, -4
+; RV64ZBA-NEXT:    slt a0, a2, a0
+; RV64ZBA-NEXT:    xori a0, a0, 1
+; RV64ZBA-NEXT:    sd a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo3.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addi a2, a0, -4
+; RV64ZICOND-NEXT:    slt a0, a2, a0
+; RV64ZICOND-NEXT:    xori a0, a0, 1
+; RV64ZICOND-NEXT:    sd a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -4)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: uaddo.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addw a1, a0, a1
+; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    sw a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addw a1, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a1, a0
+; RV64ZBA-NEXT:    sw a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addw a1, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a1, a0
+; RV64ZICOND-NEXT:    sw a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) {
+; RV64-LABEL: uaddo.i32.constant:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addiw a2, a0, -2
+; RV64-NEXT:    sltu a0, a2, a0
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.i32.constant:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addiw a2, a0, -2
+; RV64ZBA-NEXT:    sltu a0, a2, a0
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.i32.constant:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addiw a2, a0, -2
+; RV64ZICOND-NEXT:    sltu a0, a2, a0
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 -2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i32.constant_one(i32 signext %v1, ptr %res) {
+; RV64-LABEL: uaddo.i32.constant_one:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addiw a2, a0, 1
+; RV64-NEXT:    seqz a0, a2
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.i32.constant_one:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addiw a2, a0, 1
+; RV64ZBA-NEXT:    seqz a0, a2
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.i32.constant_one:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addiw a2, a0, 1
+; RV64ZICOND-NEXT:    seqz a0, a2
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: uaddo.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    sd a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    add a1, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a1, a0
+; RV64ZBA-NEXT:    sd a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    add a1, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a1, a0
+; RV64ZICOND-NEXT:    sd a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i64.constant_one(i64 %v1, ptr %res) {
+; RV64-LABEL: uaddo.i64.constant_one:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addi a2, a0, 1
+; RV64-NEXT:    seqz a0, a2
+; RV64-NEXT:    sd a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.i64.constant_one:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addi a2, a0, 1
+; RV64ZBA-NEXT:    seqz a0, a2
+; RV64ZBA-NEXT:    sd a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.i64.constant_one:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addi a2, a0, 1
+; RV64ZICOND-NEXT:    seqz a0, a2
+; RV64ZICOND-NEXT:    sd a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: ssubo1.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    subw a3, a0, a1
+; RV64-NEXT:    sub a1, a0, a1
+; RV64-NEXT:    xor a3, a1, a3
+; RV64-NEXT:    snez a0, a3
+; RV64-NEXT:    sw a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: ssubo1.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    subw a3, a0, a1
+; RV64ZBA-NEXT:    sub a1, a0, a1
+; RV64ZBA-NEXT:    xor a3, a1, a3
+; RV64ZBA-NEXT:    snez a0, a3
+; RV64ZBA-NEXT:    sw a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: ssubo1.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    subw a3, a0, a1
+; RV64ZICOND-NEXT:    sub a1, a0, a1
+; RV64ZICOND-NEXT:    xor a3, a1, a3
+; RV64ZICOND-NEXT:    snez a0, a3
+; RV64ZICOND-NEXT:    sw a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @ssubo2.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: ssubo2.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addiw a2, a0, 4
+; RV64-NEXT:    slt a0, a2, a0
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: ssubo2.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addiw a2, a0, 4
+; RV64ZBA-NEXT:    slt a0, a2, a0
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: ssubo2.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addiw a2, a0, 4
+; RV64ZICOND-NEXT:    slt a0, a2, a0
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 -4)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: ssubo.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    sgtz a3, a1
+; RV64-NEXT:    sub a1, a0, a1
+; RV64-NEXT:    slt a0, a1, a0
+; RV64-NEXT:    xor a0, a3, a0
+; RV64-NEXT:    sd a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: ssubo.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    sgtz a3, a1
+; RV64ZBA-NEXT:    sub a1, a0, a1
+; RV64ZBA-NEXT:    slt a0, a1, a0
+; RV64ZBA-NEXT:    xor a0, a3, a0
+; RV64ZBA-NEXT:    sd a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: ssubo.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    sgtz a3, a1
+; RV64ZICOND-NEXT:    sub a1, a0, a1
+; RV64ZICOND-NEXT:    slt a0, a1, a0
+; RV64ZICOND-NEXT:    xor a0, a3, a0
+; RV64ZICOND-NEXT:    sd a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: usubo.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    subw a1, a0, a1
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    sw a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    subw a1, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a0, a1
+; RV64ZBA-NEXT:    sw a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    subw a1, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a0, a1
+; RV64ZICOND-NEXT:    sw a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
+; RV64-LABEL: usubo.i32.constant.rhs:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addiw a2, a0, 2
+; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.i32.constant.rhs:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addiw a2, a0, 2
+; RV64ZBA-NEXT:    sltu a0, a0, a2
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.i32.constant.rhs:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addiw a2, a0, 2
+; RV64ZICOND-NEXT:    sltu a0, a0, a2
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 -2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
+; RV64-LABEL: usubo.i32.constant.lhs:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a2, -2
+; RV64-NEXT:    subw a2, a2, a0
+; RV64-NEXT:    addi a0, a2, 1
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.i32.constant.lhs:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    li a2, -2
+; RV64ZBA-NEXT:    subw a2, a2, a0
+; RV64ZBA-NEXT:    addi a0, a2, 1
+; RV64ZBA-NEXT:    seqz a0, a0
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.i32.constant.lhs:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    li a2, -2
+; RV64ZICOND-NEXT:    subw a2, a2, a0
+; RV64ZICOND-NEXT:    addi a0, a2, 1
+; RV64ZICOND-NEXT:    seqz a0, a0
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 -2, i32 %v1)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: usubo.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    sub a1, a0, a1
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    sd a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    sub a1, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a0, a1
+; RV64ZBA-NEXT:    sd a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    sub a1, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a0, a1
+; RV64ZICOND-NEXT:    sd a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: smulo.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulw a3, a0, a1
+; RV64-NEXT:    mul a1, a0, a1
+; RV64-NEXT:    xor a3, a1, a3
+; RV64-NEXT:    snez a0, a3
+; RV64-NEXT:    sw a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulw a3, a0, a1
+; RV64ZBA-NEXT:    mul a1, a0, a1
+; RV64ZBA-NEXT:    xor a3, a1, a3
+; RV64ZBA-NEXT:    snez a0, a3
+; RV64ZBA-NEXT:    sw a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulw a3, a0, a1
+; RV64ZICOND-NEXT:    mul a1, a0, a1
+; RV64ZICOND-NEXT:    xor a3, a1, a3
+; RV64ZICOND-NEXT:    snez a0, a3
+; RV64ZICOND-NEXT:    sw a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smulo2.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: smulo2.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    mulw a3, a0, a2
+; RV64-NEXT:    mul a2, a0, a2
+; RV64-NEXT:    xor a3, a2, a3
+; RV64-NEXT:    snez a0, a3
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo2.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    sh1add a2, a0, a0
+; RV64ZBA-NEXT:    sh2add a2, a2, a0
+; RV64ZBA-NEXT:    sext.w a0, a2
+; RV64ZBA-NEXT:    xor a0, a2, a0
+; RV64ZBA-NEXT:    snez a0, a0
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo2.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    li a2, 13
+; RV64ZICOND-NEXT:    mulw a3, a0, a2
+; RV64ZICOND-NEXT:    mul a2, a0, a2
+; RV64ZICOND-NEXT:    xor a3, a2, a3
+; RV64ZICOND-NEXT:    snez a0, a3
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 13)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: smulo.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulh a3, a0, a1
+; RV64-NEXT:    mul a1, a0, a1
+; RV64-NEXT:    srai a0, a1, 63
+; RV64-NEXT:    xor a0, a3, a0
+; RV64-NEXT:    snez a0, a0
+; RV64-NEXT:    sd a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulh a3, a0, a1
+; RV64ZBA-NEXT:    mul a1, a0, a1
+; RV64ZBA-NEXT:    srai a0, a1, 63
+; RV64ZBA-NEXT:    xor a0, a3, a0
+; RV64ZBA-NEXT:    snez a0, a0
+; RV64ZBA-NEXT:    sd a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulh a3, a0, a1
+; RV64ZICOND-NEXT:    mul a1, a0, a1
+; RV64ZICOND-NEXT:    srai a0, a1, 63
+; RV64ZICOND-NEXT:    xor a0, a3, a0
+; RV64ZICOND-NEXT:    snez a0, a0
+; RV64ZICOND-NEXT:    sd a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
+; RV64-LABEL: smulo2.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    mulh a3, a0, a2
+; RV64-NEXT:    mul a2, a0, a2
+; RV64-NEXT:    srai a0, a2, 63
+; RV64-NEXT:    xor a0, a3, a0
+; RV64-NEXT:    snez a0, a0
+; RV64-NEXT:    sd a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo2.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    li a2, 13
+; RV64ZBA-NEXT:    mulh a2, a0, a2
+; RV64ZBA-NEXT:    sh1add a3, a0, a0
+; RV64ZBA-NEXT:    sh2add a3, a3, a0
+; RV64ZBA-NEXT:    srai a0, a3, 63
+; RV64ZBA-NEXT:    xor a0, a2, a0
+; RV64ZBA-NEXT:    snez a0, a0
+; RV64ZBA-NEXT:    sd a3, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo2.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    li a2, 13
+; RV64ZICOND-NEXT:    mulh a3, a0, a2
+; RV64ZICOND-NEXT:    mul a2, a0, a2
+; RV64ZICOND-NEXT:    srai a0, a2, 63
+; RV64ZICOND-NEXT:    xor a0, a3, a0
+; RV64ZICOND-NEXT:    snez a0, a0
+; RV64ZICOND-NEXT:    sd a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 13)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: umulo.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    mulhu a1, a0, a1
+; RV64-NEXT:    srai a0, a1, 32
+; RV64-NEXT:    snez a0, a0
+; RV64-NEXT:    sw a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    zext.w a1, a1
+; RV64ZBA-NEXT:    zext.w a0, a0
+; RV64ZBA-NEXT:    mul a1, a0, a1
+; RV64ZBA-NEXT:    srai a0, a1, 32
+; RV64ZBA-NEXT:    snez a0, a0
+; RV64ZBA-NEXT:    sw a1, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    slli a1, a1, 32
+; RV64ZICOND-NEXT:    slli a0, a0, 32
+; RV64ZICOND-NEXT:    mulhu a1, a0, a1
+; RV64ZICOND-NEXT:    srai a0, a1, 32
+; RV64ZICOND-NEXT:    snez a0, a0
+; RV64ZICOND-NEXT:    sw a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: umulo2.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    slli a2, a2, 32
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    mulhu a2, a0, a2
+; RV64-NEXT:    srli a0, a2, 32
+; RV64-NEXT:    snez a0, a0
+; RV64-NEXT:    sw a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo2.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    zext.w a2, a0
+; RV64ZBA-NEXT:    sh1add.uw a0, a0, a2
+; RV64ZBA-NEXT:    sh2add a2, a0, a2
+; RV64ZBA-NEXT:    srli a0, a2, 32
+; RV64ZBA-NEXT:    snez a0, a0
+; RV64ZBA-NEXT:    sw a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo2.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    li a2, 13
+; RV64ZICOND-NEXT:    slli a2, a2, 32
+; RV64ZICOND-NEXT:    slli a0, a0, 32
+; RV64ZICOND-NEXT:    mulhu a2, a0, a2
+; RV64ZICOND-NEXT:    srli a0, a2, 32
+; RV64ZICOND-NEXT:    snez a0, a0
+; RV64ZICOND-NEXT:    sw a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 13)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, ptr %res
+  ret i1 %obit
+}
+
+; Similar to umulo.i32, but storing the overflow and returning the result.
+define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) {
+; RV64-LABEL: umulo3.i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    mulhu a0, a0, a1
+; RV64-NEXT:    srai a1, a0, 32
+; RV64-NEXT:    snez a1, a1
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    sw a1, 0(a2)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo3.i32:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    zext.w a1, a1
+; RV64ZBA-NEXT:    zext.w a0, a0
+; RV64ZBA-NEXT:    mul a3, a0, a1
+; RV64ZBA-NEXT:    srai a3, a3, 32
+; RV64ZBA-NEXT:    snez a3, a3
+; RV64ZBA-NEXT:    mulw a0, a0, a1
+; RV64ZBA-NEXT:    sw a3, 0(a2)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo3.i32:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    slli a1, a1, 32
+; RV64ZICOND-NEXT:    slli a0, a0, 32
+; RV64ZICOND-NEXT:    mulhu a0, a0, a1
+; RV64ZICOND-NEXT:    srai a1, a0, 32
+; RV64ZICOND-NEXT:    snez a1, a1
+; RV64ZICOND-NEXT:    sext.w a0, a0
+; RV64ZICOND-NEXT:    sw a1, 0(a2)
+; RV64ZICOND-NEXT:    ret
+  %4 = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %0, i32 %1)
+  %5 = extractvalue { i32, i1 } %4, 1
+  %6 = extractvalue { i32, i1 } %4, 0
+  %7 = zext i1 %5 to i32
+  store i32 %7, ptr %2, align 4
+  ret i32 %6
+}
+
+define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: umulo.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulhu a3, a0, a1
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    snez a0, a3
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulhu a3, a0, a1
+; RV64ZBA-NEXT:    mul a0, a0, a1
+; RV64ZBA-NEXT:    sd a0, 0(a2)
+; RV64ZBA-NEXT:    snez a0, a3
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulhu a3, a0, a1
+; RV64ZICOND-NEXT:    mul a0, a0, a1
+; RV64ZICOND-NEXT:    sd a0, 0(a2)
+; RV64ZICOND-NEXT:    snez a0, a3
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
+; RV64-LABEL: umulo2.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a3, 13
+; RV64-NEXT:    mulhu a2, a0, a3
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    sd a0, 0(a1)
+; RV64-NEXT:    snez a0, a2
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo2.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    li a2, 13
+; RV64ZBA-NEXT:    mulhu a2, a0, a2
+; RV64ZBA-NEXT:    sh1add a3, a0, a0
+; RV64ZBA-NEXT:    sh2add a0, a3, a0
+; RV64ZBA-NEXT:    sd a0, 0(a1)
+; RV64ZBA-NEXT:    snez a0, a2
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo2.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    li a3, 13
+; RV64ZICOND-NEXT:    mulhu a2, a0, a3
+; RV64ZICOND-NEXT:    mul a0, a0, a3
+; RV64ZICOND-NEXT:    sd a0, 0(a1)
+; RV64ZICOND-NEXT:    snez a0, a2
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 13)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+
+;
+; Check the use of the overflow bit in combination with a select instruction.
+;
+define i32 @saddo.select.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: saddo.select.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addw a2, a0, a1
+; RV64-NEXT:    add a3, a0, a1
+; RV64-NEXT:    bne a3, a2, .LBB28_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB28_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo.select.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addw a2, a0, a1
+; RV64ZBA-NEXT:    add a3, a0, a1
+; RV64ZBA-NEXT:    bne a3, a2, .LBB28_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB28_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo.select.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addw a2, a0, a1
+; RV64ZICOND-NEXT:    add a3, a0, a1
+; RV64ZICOND-NEXT:    xor a2, a3, a2
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = select i1 %obit, i32 %v1, i32 %v2
+  ret i32 %ret
+}
+
+define i1 @saddo.not.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: saddo.not.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addw a2, a0, a1
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo.not.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addw a2, a0, a1
+; RV64ZBA-NEXT:    add a0, a0, a1
+; RV64ZBA-NEXT:    xor a0, a0, a2
+; RV64ZBA-NEXT:    seqz a0, a0
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo.not.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addw a2, a0, a1
+; RV64ZICOND-NEXT:    add a0, a0, a1
+; RV64ZICOND-NEXT:    xor a0, a0, a2
+; RV64ZICOND-NEXT:    seqz a0, a0
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: saddo.select.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    add a2, a0, a1
+; RV64-NEXT:    slt a2, a2, a0
+; RV64-NEXT:    slti a3, a1, 0
+; RV64-NEXT:    bne a3, a2, .LBB30_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB30_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo.select.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    add a2, a0, a1
+; RV64ZBA-NEXT:    slt a2, a2, a0
+; RV64ZBA-NEXT:    slti a3, a1, 0
+; RV64ZBA-NEXT:    bne a3, a2, .LBB30_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB30_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo.select.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    add a2, a0, a1
+; RV64ZICOND-NEXT:    slt a2, a2, a0
+; RV64ZICOND-NEXT:    slti a3, a1, 0
+; RV64ZICOND-NEXT:    xor a2, a3, a2
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = select i1 %obit, i64 %v1, i64 %v2
+  ret i64 %ret
+}
+
+define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: saddo.not.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    add a2, a0, a1
+; RV64-NEXT:    slt a0, a2, a0
+; RV64-NEXT:    slti a1, a1, 0
+; RV64-NEXT:    xor a0, a1, a0
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo.not.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    add a2, a0, a1
+; RV64ZBA-NEXT:    slt a0, a2, a0
+; RV64ZBA-NEXT:    slti a1, a1, 0
+; RV64ZBA-NEXT:    xor a0, a1, a0
+; RV64ZBA-NEXT:    xori a0, a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo.not.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    add a2, a0, a1
+; RV64ZICOND-NEXT:    slt a0, a2, a0
+; RV64ZICOND-NEXT:    slti a1, a1, 0
+; RV64ZICOND-NEXT:    xor a0, a1, a0
+; RV64ZICOND-NEXT:    xori a0, a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: uaddo.select.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addw a2, a0, a1
+; RV64-NEXT:    bltu a2, a0, .LBB32_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB32_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.select.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addw a2, a0, a1
+; RV64ZBA-NEXT:    bltu a2, a0, .LBB32_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB32_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.select.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addw a2, a0, a1
+; RV64ZICOND-NEXT:    sltu a2, a2, a0
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = select i1 %obit, i32 %v1, i32 %v2
+  ret i32 %ret
+}
+
+define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: uaddo.not.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addw a1, a0, a1
+; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.not.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addw a1, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a1, a0
+; RV64ZBA-NEXT:    xori a0, a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.not.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addw a1, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a1, a0
+; RV64ZICOND-NEXT:    xori a0, a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: uaddo.select.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    add a2, a0, a1
+; RV64-NEXT:    bltu a2, a0, .LBB34_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB34_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.select.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    add a2, a0, a1
+; RV64ZBA-NEXT:    bltu a2, a0, .LBB34_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB34_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.select.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    add a2, a0, a1
+; RV64ZICOND-NEXT:    sltu a2, a2, a0
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = select i1 %obit, i64 %v1, i64 %v2
+  ret i64 %ret
+}
+
+define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: uaddo.not.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.not.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    add a1, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a1, a0
+; RV64ZBA-NEXT:    xori a0, a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.not.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    add a1, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a1, a0
+; RV64ZICOND-NEXT:    xori a0, a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: ssubo.select.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    subw a2, a0, a1
+; RV64-NEXT:    sub a3, a0, a1
+; RV64-NEXT:    bne a3, a2, .LBB36_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB36_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: ssubo.select.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    subw a2, a0, a1
+; RV64ZBA-NEXT:    sub a3, a0, a1
+; RV64ZBA-NEXT:    bne a3, a2, .LBB36_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB36_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: ssubo.select.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    subw a2, a0, a1
+; RV64ZICOND-NEXT:    sub a3, a0, a1
+; RV64ZICOND-NEXT:    xor a2, a3, a2
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = select i1 %obit, i32 %v1, i32 %v2
+  ret i32 %ret
+}
+
+define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: ssubo.not.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    subw a2, a0, a1
+; RV64-NEXT:    sub a0, a0, a1
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: ssubo.not.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    subw a2, a0, a1
+; RV64ZBA-NEXT:    sub a0, a0, a1
+; RV64ZBA-NEXT:    xor a0, a0, a2
+; RV64ZBA-NEXT:    seqz a0, a0
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: ssubo.not.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    subw a2, a0, a1
+; RV64ZICOND-NEXT:    sub a0, a0, a1
+; RV64ZICOND-NEXT:    xor a0, a0, a2
+; RV64ZICOND-NEXT:    seqz a0, a0
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: ssubo.select.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    sgtz a2, a1
+; RV64-NEXT:    sub a3, a0, a1
+; RV64-NEXT:    slt a3, a3, a0
+; RV64-NEXT:    bne a2, a3, .LBB38_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB38_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: ssubo.select.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    sgtz a2, a1
+; RV64ZBA-NEXT:    sub a3, a0, a1
+; RV64ZBA-NEXT:    slt a3, a3, a0
+; RV64ZBA-NEXT:    bne a2, a3, .LBB38_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB38_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: ssubo.select.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    sgtz a2, a1
+; RV64ZICOND-NEXT:    sub a3, a0, a1
+; RV64ZICOND-NEXT:    slt a3, a3, a0
+; RV64ZICOND-NEXT:    xor a2, a2, a3
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = select i1 %obit, i64 %v1, i64 %v2
+  ret i64 %ret
+}
+
+define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: ssub.not.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    sgtz a2, a1
+; RV64-NEXT:    sub a1, a0, a1
+; RV64-NEXT:    slt a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: ssub.not.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    sgtz a2, a1
+; RV64ZBA-NEXT:    sub a1, a0, a1
+; RV64ZBA-NEXT:    slt a0, a1, a0
+; RV64ZBA-NEXT:    xor a0, a2, a0
+; RV64ZBA-NEXT:    xori a0, a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: ssub.not.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    sgtz a2, a1
+; RV64ZICOND-NEXT:    sub a1, a0, a1
+; RV64ZICOND-NEXT:    slt a0, a1, a0
+; RV64ZICOND-NEXT:    xor a0, a2, a0
+; RV64ZICOND-NEXT:    xori a0, a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: usubo.select.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    subw a2, a0, a1
+; RV64-NEXT:    bltu a0, a2, .LBB40_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB40_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.select.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    subw a2, a0, a1
+; RV64ZBA-NEXT:    bltu a0, a2, .LBB40_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB40_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.select.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    subw a2, a0, a1
+; RV64ZICOND-NEXT:    sltu a2, a0, a2
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = select i1 %obit, i32 %v1, i32 %v2
+  ret i32 %ret
+}
+
+define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: usubo.not.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    subw a1, a0, a1
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.not.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    subw a1, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a0, a1
+; RV64ZBA-NEXT:    xori a0, a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.not.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    subw a1, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a0, a1
+; RV64ZICOND-NEXT:    xori a0, a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: usubo.select.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    sub a2, a0, a1
+; RV64-NEXT:    bltu a0, a2, .LBB42_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB42_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.select.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    sub a2, a0, a1
+; RV64ZBA-NEXT:    bltu a0, a2, .LBB42_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB42_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.select.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    sub a2, a0, a1
+; RV64ZICOND-NEXT:    sltu a2, a0, a2
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = select i1 %obit, i64 %v1, i64 %v2
+  ret i64 %ret
+}
+
+define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: usubo.not.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    sub a1, a0, a1
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.not.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    sub a1, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a0, a1
+; RV64ZBA-NEXT:    xori a0, a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.not.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    sub a1, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a0, a1
+; RV64ZICOND-NEXT:    xori a0, a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i32 @smulo.select.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: smulo.select.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulw a2, a0, a1
+; RV64-NEXT:    mul a3, a0, a1
+; RV64-NEXT:    bne a3, a2, .LBB44_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB44_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo.select.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulw a2, a0, a1
+; RV64ZBA-NEXT:    mul a3, a0, a1
+; RV64ZBA-NEXT:    bne a3, a2, .LBB44_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB44_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo.select.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulw a2, a0, a1
+; RV64ZICOND-NEXT:    mul a3, a0, a1
+; RV64ZICOND-NEXT:    xor a2, a3, a2
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = select i1 %obit, i32 %v1, i32 %v2
+  ret i32 %ret
+}
+
+define i1 @smulo.not.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: smulo.not.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulw a2, a0, a1
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo.not.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulw a2, a0, a1
+; RV64ZBA-NEXT:    mul a0, a0, a1
+; RV64ZBA-NEXT:    xor a0, a0, a2
+; RV64ZBA-NEXT:    seqz a0, a0
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo.not.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulw a2, a0, a1
+; RV64ZICOND-NEXT:    mul a0, a0, a1
+; RV64ZICOND-NEXT:    xor a0, a0, a2
+; RV64ZICOND-NEXT:    seqz a0, a0
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: smulo.select.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulh a2, a0, a1
+; RV64-NEXT:    mul a3, a0, a1
+; RV64-NEXT:    srai a3, a3, 63
+; RV64-NEXT:    bne a2, a3, .LBB46_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB46_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo.select.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulh a2, a0, a1
+; RV64ZBA-NEXT:    mul a3, a0, a1
+; RV64ZBA-NEXT:    srai a3, a3, 63
+; RV64ZBA-NEXT:    bne a2, a3, .LBB46_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB46_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo.select.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulh a2, a0, a1
+; RV64ZICOND-NEXT:    mul a3, a0, a1
+; RV64ZICOND-NEXT:    srai a3, a3, 63
+; RV64ZICOND-NEXT:    xor a2, a2, a3
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = select i1 %obit, i64 %v1, i64 %v2
+  ret i64 %ret
+}
+
+define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: smulo.not.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulh a2, a0, a1
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo.not.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulh a2, a0, a1
+; RV64ZBA-NEXT:    mul a0, a0, a1
+; RV64ZBA-NEXT:    srai a0, a0, 63
+; RV64ZBA-NEXT:    xor a0, a2, a0
+; RV64ZBA-NEXT:    seqz a0, a0
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo.not.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulh a2, a0, a1
+; RV64ZICOND-NEXT:    mul a0, a0, a1
+; RV64ZICOND-NEXT:    srai a0, a0, 63
+; RV64ZICOND-NEXT:    xor a0, a2, a0
+; RV64ZICOND-NEXT:    seqz a0, a0
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i32 @umulo.select.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: umulo.select.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    mulhu a2, a3, a2
+; RV64-NEXT:    srai a2, a2, 32
+; RV64-NEXT:    bnez a2, .LBB48_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB48_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo.select.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    zext.w a2, a1
+; RV64ZBA-NEXT:    zext.w a3, a0
+; RV64ZBA-NEXT:    mul a2, a3, a2
+; RV64ZBA-NEXT:    srai a2, a2, 32
+; RV64ZBA-NEXT:    bnez a2, .LBB48_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB48_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo.select.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    slli a2, a1, 32
+; RV64ZICOND-NEXT:    slli a3, a0, 32
+; RV64ZICOND-NEXT:    mulhu a2, a3, a2
+; RV64ZICOND-NEXT:    srai a2, a2, 32
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = select i1 %obit, i32 %v1, i32 %v2
+  ret i32 %ret
+}
+
+define i1 @umulo.not.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: umulo.not.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    mulhu a0, a0, a1
+; RV64-NEXT:    srai a0, a0, 32
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo.not.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    zext.w a1, a1
+; RV64ZBA-NEXT:    zext.w a0, a0
+; RV64ZBA-NEXT:    mul a0, a0, a1
+; RV64ZBA-NEXT:    srai a0, a0, 32
+; RV64ZBA-NEXT:    seqz a0, a0
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo.not.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    slli a1, a1, 32
+; RV64ZICOND-NEXT:    slli a0, a0, 32
+; RV64ZICOND-NEXT:    mulhu a0, a0, a1
+; RV64ZICOND-NEXT:    srai a0, a0, 32
+; RV64ZICOND-NEXT:    seqz a0, a0
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: umulo.select.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulhu a2, a0, a1
+; RV64-NEXT:    bnez a2, .LBB50_2
+; RV64-NEXT:  # %bb.1: # %entry
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB50_2: # %entry
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo.select.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulhu a2, a0, a1
+; RV64ZBA-NEXT:    bnez a2, .LBB50_2
+; RV64ZBA-NEXT:  # %bb.1: # %entry
+; RV64ZBA-NEXT:    mv a0, a1
+; RV64ZBA-NEXT:  .LBB50_2: # %entry
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo.select.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulhu a2, a0, a1
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = select i1 %obit, i64 %v1, i64 %v2
+  ret i64 %ret
+}
+
+define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: umulo.not.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulhu a0, a0, a1
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo.not.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulhu a0, a0, a1
+; RV64ZBA-NEXT:    seqz a0, a0
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo.not.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulhu a0, a0, a1
+; RV64ZICOND-NEXT:    seqz a0, a0
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = xor i1 %obit, true
+  ret i1 %ret
+}
+
+
+;
+; Check the use of the overflow bit in combination with a branch instruction.
+;
+define zeroext i1 @saddo.br.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: saddo.br.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addw a2, a0, a1
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    beq a0, a2, .LBB52_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB52_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo.br.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addw a2, a0, a1
+; RV64ZBA-NEXT:    add a0, a0, a1
+; RV64ZBA-NEXT:    beq a0, a2, .LBB52_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB52_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo.br.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addw a2, a0, a1
+; RV64ZICOND-NEXT:    add a0, a0, a1
+; RV64ZICOND-NEXT:    beq a0, a2, .LBB52_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB52_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: saddo.br.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    add a2, a0, a1
+; RV64-NEXT:    slt a0, a2, a0
+; RV64-NEXT:    slti a1, a1, 0
+; RV64-NEXT:    beq a1, a0, .LBB53_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB53_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: saddo.br.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    add a2, a0, a1
+; RV64ZBA-NEXT:    slt a0, a2, a0
+; RV64ZBA-NEXT:    slti a1, a1, 0
+; RV64ZBA-NEXT:    beq a1, a0, .LBB53_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB53_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: saddo.br.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    add a2, a0, a1
+; RV64ZICOND-NEXT:    slt a0, a2, a0
+; RV64ZICOND-NEXT:    slti a1, a1, 0
+; RV64ZICOND-NEXT:    beq a1, a0, .LBB53_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB53_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
+; RV64-LABEL: uaddo.br.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addw a1, a0, a1
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    bgeu a1, a0, .LBB54_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB54_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.br.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addw a1, a0, a1
+; RV64ZBA-NEXT:    sext.w a0, a0
+; RV64ZBA-NEXT:    bgeu a1, a0, .LBB54_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB54_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.br.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addw a1, a0, a1
+; RV64ZICOND-NEXT:    sext.w a0, a0
+; RV64ZICOND-NEXT:    bgeu a1, a0, .LBB54_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB54_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: uaddo.br.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    bgeu a1, a0, .LBB55_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB55_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.br.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    add a1, a0, a1
+; RV64ZBA-NEXT:    bgeu a1, a0, .LBB55_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB55_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.br.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    add a1, a0, a1
+; RV64ZICOND-NEXT:    bgeu a1, a0, .LBB55_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB55_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: ssubo.br.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    subw a2, a0, a1
+; RV64-NEXT:    sub a0, a0, a1
+; RV64-NEXT:    beq a0, a2, .LBB56_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB56_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: ssubo.br.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    subw a2, a0, a1
+; RV64ZBA-NEXT:    sub a0, a0, a1
+; RV64ZBA-NEXT:    beq a0, a2, .LBB56_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB56_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: ssubo.br.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    subw a2, a0, a1
+; RV64ZICOND-NEXT:    sub a0, a0, a1
+; RV64ZICOND-NEXT:    beq a0, a2, .LBB56_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB56_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: ssubo.br.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    sgtz a2, a1
+; RV64-NEXT:    sub a1, a0, a1
+; RV64-NEXT:    slt a0, a1, a0
+; RV64-NEXT:    beq a2, a0, .LBB57_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB57_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: ssubo.br.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    sgtz a2, a1
+; RV64ZBA-NEXT:    sub a1, a0, a1
+; RV64ZBA-NEXT:    slt a0, a1, a0
+; RV64ZBA-NEXT:    beq a2, a0, .LBB57_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB57_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: ssubo.br.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    sgtz a2, a1
+; RV64ZICOND-NEXT:    sub a1, a0, a1
+; RV64ZICOND-NEXT:    slt a0, a1, a0
+; RV64ZICOND-NEXT:    beq a2, a0, .LBB57_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB57_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: usubo.br.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    subw a1, a0, a1
+; RV64-NEXT:    bgeu a0, a1, .LBB58_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB58_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.br.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    subw a1, a0, a1
+; RV64ZBA-NEXT:    bgeu a0, a1, .LBB58_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB58_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.br.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    subw a1, a0, a1
+; RV64ZICOND-NEXT:    bgeu a0, a1, .LBB58_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB58_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: usubo.br.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    sub a1, a0, a1
+; RV64-NEXT:    bgeu a0, a1, .LBB59_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB59_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: usubo.br.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    sub a1, a0, a1
+; RV64ZBA-NEXT:    bgeu a0, a1, .LBB59_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB59_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: usubo.br.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    sub a1, a0, a1
+; RV64ZICOND-NEXT:    bgeu a0, a1, .LBB59_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB59_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @smulo.br.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: smulo.br.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulw a2, a0, a1
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    beq a0, a2, .LBB60_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB60_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo.br.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulw a2, a0, a1
+; RV64ZBA-NEXT:    mul a0, a0, a1
+; RV64ZBA-NEXT:    beq a0, a2, .LBB60_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB60_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo.br.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulw a2, a0, a1
+; RV64ZICOND-NEXT:    mul a0, a0, a1
+; RV64ZICOND-NEXT:    beq a0, a2, .LBB60_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB60_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: smulo.br.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulh a2, a0, a1
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    beq a2, a0, .LBB61_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB61_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo.br.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulh a2, a0, a1
+; RV64ZBA-NEXT:    mul a0, a0, a1
+; RV64ZBA-NEXT:    srai a0, a0, 63
+; RV64ZBA-NEXT:    beq a2, a0, .LBB61_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB61_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo.br.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulh a2, a0, a1
+; RV64ZICOND-NEXT:    mul a0, a0, a1
+; RV64ZICOND-NEXT:    srai a0, a0, 63
+; RV64ZICOND-NEXT:    beq a2, a0, .LBB61_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB61_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @smulo2.br.i64(i64 %v1) {
+; RV64-LABEL: smulo2.br.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a1, -13
+; RV64-NEXT:    mulh a2, a0, a1
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    beq a2, a0, .LBB62_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB62_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: smulo2.br.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    li a1, -13
+; RV64ZBA-NEXT:    mulh a2, a0, a1
+; RV64ZBA-NEXT:    mul a0, a0, a1
+; RV64ZBA-NEXT:    srai a0, a0, 63
+; RV64ZBA-NEXT:    beq a2, a0, .LBB62_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB62_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: smulo2.br.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    li a1, -13
+; RV64ZICOND-NEXT:    mulh a2, a0, a1
+; RV64ZICOND-NEXT:    mul a0, a0, a1
+; RV64ZICOND-NEXT:    srai a0, a0, 63
+; RV64ZICOND-NEXT:    beq a2, a0, .LBB62_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB62_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 -13)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @umulo.br.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: umulo.br.i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    mulhu a0, a0, a1
+; RV64-NEXT:    srai a0, a0, 32
+; RV64-NEXT:    beqz a0, .LBB63_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB63_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo.br.i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    zext.w a1, a1
+; RV64ZBA-NEXT:    zext.w a0, a0
+; RV64ZBA-NEXT:    mul a0, a0, a1
+; RV64ZBA-NEXT:    srai a0, a0, 32
+; RV64ZBA-NEXT:    beqz a0, .LBB63_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB63_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo.br.i32:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    slli a1, a1, 32
+; RV64ZICOND-NEXT:    slli a0, a0, 32
+; RV64ZICOND-NEXT:    mulhu a0, a0, a1
+; RV64ZICOND-NEXT:    srai a0, a0, 32
+; RV64ZICOND-NEXT:    beqz a0, .LBB63_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB63_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: umulo.br.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulhu a0, a0, a1
+; RV64-NEXT:    beqz a0, .LBB64_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB64_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo.br.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    mulhu a0, a0, a1
+; RV64ZBA-NEXT:    beqz a0, .LBB64_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB64_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo.br.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    mulhu a0, a0, a1
+; RV64ZICOND-NEXT:    beqz a0, .LBB64_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB64_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @umulo2.br.i64(i64 %v1) {
+; RV64-LABEL: umulo2.br.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    add a1, a0, a0
+; RV64-NEXT:    bgeu a1, a0, .LBB65_2
+; RV64-NEXT:  # %bb.1: # %overflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB65_2: # %continue
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: umulo2.br.i64:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    add a1, a0, a0
+; RV64ZBA-NEXT:    bgeu a1, a0, .LBB65_2
+; RV64ZBA-NEXT:  # %bb.1: # %overflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:    ret
+; RV64ZBA-NEXT:  .LBB65_2: # %continue
+; RV64ZBA-NEXT:    li a0, 1
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: umulo2.br.i64:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    add a1, a0, a0
+; RV64ZICOND-NEXT:    bgeu a1, a0, .LBB65_2
+; RV64ZICOND-NEXT:  # %bb.1: # %overflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:    ret
+; RV64ZICOND-NEXT:  .LBB65_2: # %continue
+; RV64ZICOND-NEXT:    li a0, 1
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) {
+; RV64-LABEL: uaddo.i64.constant:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addi a2, a0, 2
+; RV64-NEXT:    sltu a0, a2, a0
+; RV64-NEXT:    sd a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.i64.constant:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addi a2, a0, 2
+; RV64ZBA-NEXT:    sltu a0, a2, a0
+; RV64ZBA-NEXT:    sd a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.i64.constant:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addi a2, a0, 2
+; RV64ZICOND-NEXT:    sltu a0, a2, a0
+; RV64ZICOND-NEXT:    sd a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) {
+; RV64-LABEL: uaddo.i64.constant_2048:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addi a2, a0, 2047
+; RV64-NEXT:    addi a2, a2, 1
+; RV64-NEXT:    sltu a0, a2, a0
+; RV64-NEXT:    sd a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.i64.constant_2048:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addi a2, a0, 2047
+; RV64ZBA-NEXT:    addi a2, a2, 1
+; RV64ZBA-NEXT:    sltu a0, a2, a0
+; RV64ZBA-NEXT:    sd a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.i64.constant_2048:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addi a2, a0, 2047
+; RV64ZICOND-NEXT:    addi a2, a2, 1
+; RV64ZICOND-NEXT:    sltu a0, a2, a0
+; RV64ZICOND-NEXT:    sd a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2048)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) {
+; RV64-LABEL: uaddo.i64.constant_2049:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addi a2, a0, 2047
+; RV64-NEXT:    addi a2, a2, 2
+; RV64-NEXT:    sltu a0, a2, a0
+; RV64-NEXT:    sd a2, 0(a1)
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.i64.constant_2049:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    addi a2, a0, 2047
+; RV64ZBA-NEXT:    addi a2, a2, 2
+; RV64ZBA-NEXT:    sltu a0, a2, a0
+; RV64ZBA-NEXT:    sd a2, 0(a1)
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.i64.constant_2049:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    addi a2, a0, 2047
+; RV64ZICOND-NEXT:    addi a2, a2, 2
+; RV64ZICOND-NEXT:    sltu a0, a2, a0
+; RV64ZICOND-NEXT:    sd a2, 0(a1)
+; RV64ZICOND-NEXT:    ret
+entry:
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2049)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) {
+; RV64-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    ld a1, 0(a0)
+; RV64-NEXT:    addi a0, a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB69_2
+; RV64-NEXT:  # %bb.1: # %IfOverflow
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:  .LBB69_2: # %IfNoOverflow
+; RV64-NEXT:    ret
+;
+; RV64ZBA-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    ld a1, 0(a0)
+; RV64ZBA-NEXT:    addi a0, a1, 2
+; RV64ZBA-NEXT:    bltu a0, a1, .LBB69_2
+; RV64ZBA-NEXT:  # %bb.1: # %IfOverflow
+; RV64ZBA-NEXT:    li a0, 0
+; RV64ZBA-NEXT:  .LBB69_2: # %IfNoOverflow
+; RV64ZBA-NEXT:    ret
+;
+; RV64ZICOND-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    ld a1, 0(a0)
+; RV64ZICOND-NEXT:    addi a0, a1, 2
+; RV64ZICOND-NEXT:    bltu a0, a1, .LBB69_2
+; RV64ZICOND-NEXT:  # %bb.1: # %IfOverflow
+; RV64ZICOND-NEXT:    li a0, 0
+; RV64ZICOND-NEXT:  .LBB69_2: # %IfNoOverflow
+; RV64ZICOND-NEXT:    ret
+entry:
+  %v1 = load i64, ptr %p
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %IfNoOverflow, label %IfOverflow
+IfOverflow:
+  ret i64 0
+IfNoOverflow:
+  ret i64 %val
+}
+
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/llvm/test/CodeGen/RISCV/rv64-statepoint-call-lowering.ll b/llvm/test/CodeGen/RISCV/rv64-statepoint-call-lowering.ll
index 2fa344d4d79a7f..15c1f4325f117e 100644
--- a/llvm/test/CodeGen/RISCV/rv64-statepoint-call-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-statepoint-call-lowering.ll
@@ -186,9 +186,8 @@ define i1 @test_cross_bb(ptr addrspace(1) %a, i1 %external_cond) gc "statepoint-
 ; CHECK-NEXT:  .Ltmp8:
 ; CHECK-NEXT:    beqz s0, .LBB8_2
 ; CHECK-NEXT:  # %bb.1: # %left
-; CHECK-NEXT:    ld a1, 8(sp)
 ; CHECK-NEXT:    mv s0, a0
-; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    ld a0, 8(sp)
 ; CHECK-NEXT:    call consume
 ; CHECK-NEXT:    mv a0, s0
 ; CHECK-NEXT:    j .LBB8_3
diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
index 7839b602706db1..3c0888fa170361 100644
--- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
@@ -23,9 +23,8 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vmsne.vi v10, v10, 0
 ; RV32-NEXT:    vmv1r.v v11, v0
-; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vmsne.vi v0, v10, 0
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; RV32-NEXT:    vmv.x.s a0, v8
@@ -47,9 +46,8 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
 ; RV64-NEXT:    seqz a0, a0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vmv.v.x v12, a0
-; RV64-NEXT:    vmsne.vi v12, v12, 0
 ; RV64-NEXT:    vmv1r.v v13, v0
-; RV64-NEXT:    vmv1r.v v0, v12
+; RV64-NEXT:    vmsne.vi v0, v12, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; RV64-NEXT:    vmv.x.s a0, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 9f8de22b25c2df..27a5773e64043a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -26,10 +26,10 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; CHECK-NEXT:    vadd.vi v12, v11, -16
 ; CHECK-NEXT:    vrgather.vv v9, v8, v12, v0.t
-; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
 ; CHECK-NEXT:    vrgather.vv v13, v10, v12
 ; CHECK-NEXT:    vadd.vi v10, v11, -15
+; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10, v0.t
 ; CHECK-NEXT:    vmsne.vi v8, v13, 0
 ; CHECK-NEXT:    vmv.v.v v0, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index 51eb63f5f92212..6acda17e412b30 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -52,9 +52,8 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -68,9 +67,8 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
@@ -124,9 +122,8 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -140,9 +137,8 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
@@ -198,9 +194,8 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -214,8 +209,8 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v8, v14
@@ -274,8 +269,8 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v14
@@ -290,8 +285,8 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v8, v20
@@ -326,9 +321,8 @@ define <2 x float> @vfmax_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -360,9 +354,8 @@ define <4 x float> @vfmax_vv_v4f32_unmasked(<4 x float> %va, <4 x float> %vb, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -396,8 +389,8 @@ define <8 x float> @vfmax_vv_v8f32_unmasked(<8 x float> %va, <8 x float> %vb, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v14
@@ -432,8 +425,8 @@ define <16 x float> @vfmax_vv_v16f32_unmasked(<16 x float> %va, <16 x float> %vb
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v20
@@ -466,9 +459,8 @@ define <2 x double> @vfmax_vv_v2f64_unmasked(<2 x double> %va, <2 x double> %vb,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -502,8 +494,8 @@ define <4 x double> @vfmax_vv_v4f64_unmasked(<4 x double> %va, <4 x double> %vb,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v14
@@ -538,8 +530,8 @@ define <8 x double> @vfmax_vv_v8f64_unmasked(<8 x double> %va, <8 x double> %vb,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v20
@@ -587,8 +579,8 @@ define <16 x double> @vfmax_vv_v16f64_unmasked(<16 x double> %va, <16 x double>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24
@@ -767,8 +759,8 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index 02c2fafc89785f..d60a4ff4391d71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -15,9 +15,8 @@ define <2 x half> @vfmax_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -29,9 +28,8 @@ define <2 x half> @vfmax_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
@@ -48,9 +46,8 @@ define <4 x half> @vfmax_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -62,9 +59,8 @@ define <4 x half> @vfmax_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
@@ -81,9 +77,8 @@ define <8 x half> @vfmax_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -95,8 +90,8 @@ define <8 x half> @vfmax_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v8, v14
@@ -114,8 +109,8 @@ define <16 x half> @vfmax_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v14
@@ -128,8 +123,8 @@ define <16 x half> @vfmax_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v8, v20
@@ -147,9 +142,8 @@ define <2 x float> @vfmax_v2f32_vv(<2 x float> %a, <2 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -164,9 +158,8 @@ define <4 x float> @vfmax_v4f32_vv(<4 x float> %a, <4 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -181,8 +174,8 @@ define <8 x float> @vfmax_v8f32_vv(<8 x float> %a, <8 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v14
@@ -198,8 +191,8 @@ define <16 x float> @vfmax_v16f32_vv(<16 x float> %a, <16 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v20
@@ -215,9 +208,8 @@ define <2 x double> @vfmax_v2f64_vv(<2 x double> %a, <2 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -232,8 +224,8 @@ define <4 x double> @vfmax_v4f64_vv(<4 x double> %a, <4 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v14
@@ -249,8 +241,8 @@ define <8 x double> @vfmax_v8f64_vv(<8 x double> %a, <8 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v20
@@ -266,8 +258,8 @@ define <16 x double> @vfmax_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24
@@ -304,9 +296,8 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfadd.vv v10, v8, v8
 ; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFH-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v8
+; ZVFH-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v11, v8
 ; ZVFH-NEXT:    ret
@@ -325,9 +316,8 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
@@ -345,9 +335,8 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfadd.vv v10, v9, v9
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v9, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v9
+; ZVFH-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -366,9 +355,8 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v9, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index 03e0ac42c442c4..5f785612247728 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -52,9 +52,8 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -68,9 +67,8 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
@@ -124,9 +122,8 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -140,9 +137,8 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
@@ -198,9 +194,8 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -214,8 +209,8 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v8, v14
@@ -274,8 +269,8 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v14
@@ -290,8 +285,8 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v8, v20
@@ -326,9 +321,8 @@ define <2 x float> @vfmin_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -360,9 +354,8 @@ define <4 x float> @vfmin_vv_v4f32_unmasked(<4 x float> %va, <4 x float> %vb, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -396,8 +389,8 @@ define <8 x float> @vfmin_vv_v8f32_unmasked(<8 x float> %va, <8 x float> %vb, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v14
@@ -432,8 +425,8 @@ define <16 x float> @vfmin_vv_v16f32_unmasked(<16 x float> %va, <16 x float> %vb
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v20
@@ -466,9 +459,8 @@ define <2 x double> @vfmin_vv_v2f64_unmasked(<2 x double> %va, <2 x double> %vb,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -502,8 +494,8 @@ define <4 x double> @vfmin_vv_v4f64_unmasked(<4 x double> %va, <4 x double> %vb,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v14
@@ -538,8 +530,8 @@ define <8 x double> @vfmin_vv_v8f64_unmasked(<8 x double> %va, <8 x double> %vb,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v20
@@ -587,8 +579,8 @@ define <16 x double> @vfmin_vv_v16f64_unmasked(<16 x double> %va, <16 x double>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24
@@ -767,8 +759,8 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index b15d697f0754ed..991fe821a8421b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -15,9 +15,8 @@ define <2 x half> @vfmin_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -29,9 +28,8 @@ define <2 x half> @vfmin_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
@@ -48,9 +46,8 @@ define <4 x half> @vfmin_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -62,9 +59,8 @@ define <4 x half> @vfmin_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
@@ -81,9 +77,8 @@ define <8 x half> @vfmin_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -95,8 +90,8 @@ define <8 x half> @vfmin_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v8, v14
@@ -114,8 +109,8 @@ define <16 x half> @vfmin_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v14
@@ -128,8 +123,8 @@ define <16 x half> @vfmin_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v8, v20
@@ -147,9 +142,8 @@ define <2 x float> @vfmin_v2f32_vv(<2 x float> %a, <2 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -164,9 +158,8 @@ define <4 x float> @vfmin_v4f32_vv(<4 x float> %a, <4 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -181,8 +174,8 @@ define <8 x float> @vfmin_v8f32_vv(<8 x float> %a, <8 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v14
@@ -198,8 +191,8 @@ define <16 x float> @vfmin_v16f32_vv(<16 x float> %a, <16 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v20
@@ -215,9 +208,8 @@ define <2 x double> @vfmin_v2f64_vv(<2 x double> %a, <2 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -232,8 +224,8 @@ define <4 x double> @vfmin_v4f64_vv(<4 x double> %a, <4 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v14
@@ -249,8 +241,8 @@ define <8 x double> @vfmin_v8f64_vv(<8 x double> %a, <8 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v20
@@ -266,8 +258,8 @@ define <16 x double> @vfmin_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24
@@ -304,9 +296,8 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfadd.vv v10, v8, v8
 ; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFH-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v8
+; ZVFH-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v11, v8
 ; ZVFH-NEXT:    ret
@@ -325,9 +316,8 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
@@ -345,9 +335,8 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfadd.vv v10, v9, v9
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v9, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v10, v0
-; ZVFH-NEXT:    vmv1r.v v0, v9
+; ZVFH-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -366,9 +355,8 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v9, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
index a1e81ea41c249b..07d8c804293a44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
@@ -388,8 +388,8 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vmfeq.vf v8, v16, fa5
 ; RV32-NEXT:    vse64.v v24, (a1), v0.t
+; RV32-NEXT:    vmfeq.vf v8, v16, fa5
 ; RV32-NEXT:    addi a0, a1, 128
 ; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    addi a1, sp, 16
@@ -428,8 +428,8 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vmfeq.vf v8, v16, fa5
 ; RV64-NEXT:    vse64.v v24, (a1), v0.t
+; RV64-NEXT:    vmfeq.vf v8, v16, fa5
 ; RV64-NEXT:    addi a0, a1, 128
 ; RV64-NEXT:    vmv1r.v v0, v8
 ; RV64-NEXT:    addi a1, sp, 16
@@ -496,8 +496,8 @@ define void @masked_store_v64f32(<64 x float>* %val_ptr, <64 x float>* %a, <64 x
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
 ; CHECK-NEXT:    vse32.v v24, (a1), v0.t
+; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
 ; CHECK-NEXT:    addi a0, a1, 128
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    addi a1, sp, 16
@@ -545,8 +545,8 @@ define void @masked_store_v128f16(<128 x half>* %val_ptr, <128 x half>* %a, <128
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
 ; CHECK-NEXT:    vse16.v v24, (a1), v0.t
+; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
 ; CHECK-NEXT:    addi a0, a1, 128
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    addi a1, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
index 86c28247e97ef1..395af121b3c6b0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
@@ -466,8 +466,8 @@ define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vmseq.vi v8, v16, 0
 ; RV64-NEXT:    vse64.v v24, (a1), v0.t
+; RV64-NEXT:    vmseq.vi v8, v16, 0
 ; RV64-NEXT:    addi a0, a1, 128
 ; RV64-NEXT:    vmv1r.v v0, v8
 ; RV64-NEXT:    addi a1, sp, 16
@@ -550,8 +550,8 @@ define void @masked_store_v64i32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmseq.vi v8, v16, 0
 ; CHECK-NEXT:    vse32.v v24, (a1), v0.t
+; CHECK-NEXT:    vmseq.vi v8, v16, 0
 ; CHECK-NEXT:    addi a0, a1, 128
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    addi a1, sp, 16
@@ -616,8 +616,8 @@ define void @masked_store_v128i16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmseq.vi v8, v16, 0
 ; CHECK-NEXT:    vse16.v v24, (a1), v0.t
+; CHECK-NEXT:    vmseq.vi v8, v16, 0
 ; CHECK-NEXT:    addi a0, a1, 128
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    addi a1, sp, 16
@@ -664,8 +664,8 @@ define void @masked_store_v256i8(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmseq.vi v8, v16, 0
 ; CHECK-NEXT:    vse8.v v24, (a1), v0.t
+; CHECK-NEXT:    vmseq.vi v8, v16, 0
 ; CHECK-NEXT:    addi a0, a1, 128
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    addi a1, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index e9e147861df564..9b77f801545da0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1937,8 +1937,8 @@ define float @vreduce_fminimum_v64f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    vle32.v v24, (a1)
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
@@ -2034,8 +2034,8 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
@@ -2052,8 +2052,8 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
@@ -2274,8 +2274,8 @@ define double @vreduce_fminimum_v32f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v16, (a0)
 ; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
@@ -2369,8 +2369,8 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
@@ -2387,8 +2387,8 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
@@ -2693,8 +2693,8 @@ define float @vreduce_fmaximum_v64f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    vle32.v v24, (a1)
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
@@ -2790,8 +2790,8 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
@@ -2808,8 +2808,8 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
@@ -3030,8 +3030,8 @@ define double @vreduce_fmaximum_v32f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v16, (a0)
 ; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
@@ -3125,8 +3125,8 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
@@ -3143,8 +3143,8 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 016f95bfef7e71..5e68d8cbb07556 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -1379,9 +1379,9 @@ define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl)
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, a1
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmsne.vi v9, v9, 0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1400,9 +1400,9 @@ define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl)
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV64-NEXT:    vmv.s.x v9, a1
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmsne.vi v9, v9, 0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1427,10 +1427,10 @@ define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vid.v v9
 ; RV32-NEXT:    vmsltu.vx v9, v9, a1
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1452,10 +1452,10 @@ define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vid.v v9
 ; RV64-NEXT:    vmsltu.vx v9, v9, a1
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1483,10 +1483,10 @@ define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vid.v v9
 ; RV32-NEXT:    vmsltu.vx v9, v9, a1
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1510,10 +1510,10 @@ define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vid.v v9
 ; RV64-NEXT:    vmsltu.vx v9, v9, a1
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1543,10 +1543,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vid.v v10
 ; RV32-NEXT:    vmsltu.vx v9, v10, a1
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1572,10 +1572,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV64-NEXT:    vid.v v10
 ; RV64-NEXT:    vmsltu.vx v9, v10, a1
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1607,10 +1607,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m,
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vid.v v12
 ; RV32-NEXT:    vmsltu.vx v9, v12, a1
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1638,10 +1638,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m,
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vid.v v12
 ; RV64-NEXT:    vmsltu.vx v9, v12, a1
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1754,10 +1754,10 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV32-NEXT:    addi a2, a2, %lo(.LCPI72_0)
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV32-NEXT:    vle8.v v12, (a2)
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vid.v v16
 ; RV32-NEXT:    vmsltu.vx v14, v16, a1
 ; RV32-NEXT:    vsext.vf4 v16, v12
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmsltu.vx v12, v16, a1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vslideup.vi v14, v12, 4
@@ -1798,10 +1798,10 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV64-NEXT:    addi a2, a2, %lo(.LCPI72_0)
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vle8.v v12, (a2)
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vid.v v16
 ; RV64-NEXT:    vmsltu.vx v14, v16, a1
 ; RV64-NEXT:    vsext.vf4 v16, v12
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmsltu.vx v12, v16, a1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vslideup.vi v14, v12, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index 05896d8ef6ffdf..bf86eb65855039 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -23,9 +23,8 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -37,9 +36,8 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
@@ -56,9 +54,8 @@ define <vscale x 2 x half> @vfmax_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -70,9 +67,8 @@ define <vscale x 2 x half> @vfmax_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
@@ -89,9 +85,8 @@ define <vscale x 4 x half> @vfmax_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -103,8 +98,8 @@ define <vscale x 4 x half> @vfmax_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v8, v14
@@ -122,8 +117,8 @@ define <vscale x 8 x half> @vfmax_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v14
@@ -136,8 +131,8 @@ define <vscale x 8 x half> @vfmax_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v8, v20
@@ -155,8 +150,8 @@ define <vscale x 16 x half> @vfmax_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
 ; ZVFH-NEXT:    vmerge.vvm v20, v8, v12, v0
+; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v20
@@ -175,8 +170,8 @@ define <vscale x 16 x half> @vfmax_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
@@ -201,8 +196,8 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmv1r.v v0, v7
 ; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v24
@@ -222,8 +217,8 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v24, v24
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v8, v8, v16
@@ -239,8 +234,8 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v24
@@ -268,9 +263,8 @@ define <vscale x 1 x float> @vfmax_nxv1f32_vv(<vscale x 1 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -285,9 +279,8 @@ define <vscale x 2 x float> @vfmax_nxv2f32_vv(<vscale x 2 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -302,8 +295,8 @@ define <vscale x 4 x float> @vfmax_nxv4f32_vv(<vscale x 4 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v14
@@ -319,8 +312,8 @@ define <vscale x 8 x float> @vfmax_nxv8f32_vv(<vscale x 8 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v20
@@ -336,8 +329,8 @@ define <vscale x 16 x float> @vfmax_nxv16f32_vv(<vscale x 16 x float> %a, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24
@@ -353,9 +346,8 @@ define <vscale x 1 x double> @vfmax_nxv1f64_vv(<vscale x 1 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -370,8 +362,8 @@ define <vscale x 2 x double> @vfmax_nxv2f64_vv(<vscale x 2 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v14
@@ -387,8 +379,8 @@ define <vscale x 4 x double> @vfmax_nxv4f64_vv(<vscale x 4 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v20
@@ -404,8 +396,8 @@ define <vscale x 8 x double> @vfmax_nxv8f64_vv(<vscale x 8 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24
@@ -459,9 +451,8 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
@@ -496,9 +487,8 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v9, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index ab07fff59b218f..5e3d2d49f5d0f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -52,9 +52,8 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -68,9 +67,8 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
@@ -124,9 +122,8 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -140,9 +137,8 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -198,9 +194,8 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -214,8 +209,8 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v8, v14
@@ -274,8 +269,8 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v14
@@ -290,8 +285,8 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v8, v20
@@ -363,8 +358,8 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
 ; ZVFH-NEXT:    vmerge.vvm v20, v8, v12, v0
+; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v20
@@ -385,8 +380,8 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
@@ -586,8 +581,8 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmv1r.v v0, v7
 ; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; ZVFH-NEXT:    vfmax.vv v8, v8, v24
@@ -677,8 +672,8 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24
@@ -718,9 +713,8 @@ define <vscale x 1 x float> @vfmax_vv_nxv1f32_unmasked(<vscale x 1 x float> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -752,9 +746,8 @@ define <vscale x 2 x float> @vfmax_vv_nxv2f32_unmasked(<vscale x 2 x float> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -788,8 +781,8 @@ define <vscale x 4 x float> @vfmax_vv_nxv4f32_unmasked(<vscale x 4 x float> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v14
@@ -824,8 +817,8 @@ define <vscale x 8 x float> @vfmax_vv_nxv8f32_unmasked(<vscale x 8 x float> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v20
@@ -858,9 +851,8 @@ define <vscale x 1 x double> @vfmax_vv_nxv1f64_unmasked(<vscale x 1 x double> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -894,8 +886,8 @@ define <vscale x 2 x double> @vfmax_vv_nxv2f64_unmasked(<vscale x 2 x double> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v14
@@ -930,8 +922,8 @@ define <vscale x 4 x double> @vfmax_vv_nxv4f64_unmasked(<vscale x 4 x double> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v20
@@ -979,8 +971,8 @@ define <vscale x 8 x double> @vfmax_vv_nxv8f64_unmasked(<vscale x 8 x double> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24
@@ -1189,7 +1181,6 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -1197,6 +1188,7 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT:    vfmax.vv v8, v16, v8
@@ -1218,8 +1210,8 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index e9425939249878..9caad8b1f69ea0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -23,9 +23,8 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -37,9 +36,8 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
@@ -56,9 +54,8 @@ define <vscale x 2 x half> @vfmin_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -70,9 +67,8 @@ define <vscale x 2 x half> @vfmin_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
@@ -89,9 +85,8 @@ define <vscale x 4 x half> @vfmin_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -103,8 +98,8 @@ define <vscale x 4 x half> @vfmin_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v8, v14
@@ -122,8 +117,8 @@ define <vscale x 8 x half> @vfmin_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v14
@@ -136,8 +131,8 @@ define <vscale x 8 x half> @vfmin_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v8, v20
@@ -155,8 +150,8 @@ define <vscale x 16 x half> @vfmin_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
 ; ZVFH-NEXT:    vmerge.vvm v20, v8, v12, v0
+; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v20
@@ -175,8 +170,8 @@ define <vscale x 16 x half> @vfmin_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
@@ -201,8 +196,8 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmv1r.v v0, v7
 ; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v24
@@ -222,8 +217,8 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v24, v24
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v8, v8, v16
@@ -239,8 +234,8 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v24
@@ -268,9 +263,8 @@ define <vscale x 1 x float> @vfmin_nxv1f32_vv(<vscale x 1 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -285,9 +279,8 @@ define <vscale x 2 x float> @vfmin_nxv2f32_vv(<vscale x 2 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -302,8 +295,8 @@ define <vscale x 4 x float> @vfmin_nxv4f32_vv(<vscale x 4 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v14
@@ -319,8 +312,8 @@ define <vscale x 8 x float> @vfmin_nxv8f32_vv(<vscale x 8 x float> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v20
@@ -336,8 +329,8 @@ define <vscale x 16 x float> @vfmin_nxv16f32_vv(<vscale x 16 x float> %a, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24
@@ -353,9 +346,8 @@ define <vscale x 1 x double> @vfmin_nxv1f64_vv(<vscale x 1 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -370,8 +362,8 @@ define <vscale x 2 x double> @vfmin_nxv2f64_vv(<vscale x 2 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v14
@@ -387,8 +379,8 @@ define <vscale x 4 x double> @vfmin_nxv4f64_vv(<vscale x 4 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v20
@@ -404,8 +396,8 @@ define <vscale x 8 x double> @vfmin_nxv8f64_vv(<vscale x 8 x double> %a, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24
@@ -459,9 +451,8 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
@@ -496,9 +487,8 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v9, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index fc5b11284dab0c..3385effad0e865 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -52,9 +52,8 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -68,9 +67,8 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
@@ -124,9 +122,8 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv1r.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -140,9 +137,8 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v11, v10, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -198,9 +194,8 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v10, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT:    vmv.v.v v0, v10
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v11
 ; ZVFH-NEXT:    ret
@@ -214,8 +209,8 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v8, v14
@@ -274,8 +269,8 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT:    vmfeq.vv v12, v10, v10
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v14
@@ -290,8 +285,8 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v8, v20
@@ -363,8 +358,8 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
 ; ZVFH-NEXT:    vmerge.vvm v20, v8, v12, v0
+; ZVFH-NEXT:    vmfeq.vv v16, v12, v12
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v20
@@ -385,8 +380,8 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
@@ -586,8 +581,8 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFH-NEXT:    vmv1r.v v0, v7
 ; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; ZVFH-NEXT:    vfmin.vv v8, v8, v24
@@ -677,8 +672,8 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24
@@ -718,9 +713,8 @@ define <vscale x 1 x float> @vfmin_vv_nxv1f32_unmasked(<vscale x 1 x float> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -752,9 +746,8 @@ define <vscale x 2 x float> @vfmin_vv_nxv2f32_unmasked(<vscale x 2 x float> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -788,8 +781,8 @@ define <vscale x 4 x float> @vfmin_vv_nxv4f32_unmasked(<vscale x 4 x float> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v14
@@ -824,8 +817,8 @@ define <vscale x 8 x float> @vfmin_vv_nxv8f32_unmasked(<vscale x 8 x float> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v20
@@ -858,9 +851,8 @@ define <vscale x 1 x double> @vfmin_vv_nxv1f64_unmasked(<vscale x 1 x double> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v10, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v11
 ; CHECK-NEXT:    ret
@@ -894,8 +886,8 @@ define <vscale x 2 x double> @vfmin_vv_nxv2f64_unmasked(<vscale x 2 x double> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT:    vmfeq.vv v12, v10, v10
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v14
@@ -930,8 +922,8 @@ define <vscale x 4 x double> @vfmin_vv_nxv4f64_unmasked(<vscale x 4 x double> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT:    vmfeq.vv v16, v12, v12
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v20
@@ -979,8 +971,8 @@ define <vscale x 8 x double> @vfmin_vv_nxv8f64_unmasked(<vscale x 8 x double> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24
@@ -1189,7 +1181,6 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -1197,6 +1188,7 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT:    vfmin.vv v8, v16, v8
@@ -1218,8 +1210,8 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 1395dc914bb402..6cfc98645f1702 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -2361,10 +2361,9 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    snez a1, a1
 ; CHECK-NOV-NEXT:    snez a2, s1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    and a1, a1, a0
-; CHECK-NOV-NEXT:    mv a0, a2
+; CHECK-NOV-NEXT:    and a0, a2, s0
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -2768,10 +2767,9 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    snez a1, a1
 ; CHECK-NOV-NEXT:    snez a2, s1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    and a1, a1, a0
-; CHECK-NOV-NEXT:    mv a0, a2
+; CHECK-NOV-NEXT:    and a0, a2, s0
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3173,10 +3171,9 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    snez a1, a1
 ; CHECK-NOV-NEXT:    snez a2, s2
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a2, a2, s1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    and a1, a1, a0
-; CHECK-NOV-NEXT:    mv a0, a2
+; CHECK-NOV-NEXT:    and a0, a2, s1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
index b891207341b33e..54bdf984e82d95 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
@@ -21,7 +21,7 @@ body:             |
     ; CHECK-NEXT: renamable $v0 = COPY killed renamable $v1
     ; CHECK-NEXT: renamable $v9 = PseudoVMERGE_VIM_M1 undef renamable $v9, killed renamable $v3, 1, killed renamable $v0, 1, 3 /* e8 */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: renamable $v0 = PseudoVADD_VV_M1 undef renamable $v0, killed renamable $v8, killed renamable $v9, 1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
-    ; CHECK-NEXT: PseudoRET implicit $v0
+    ; CHECK-NEXT: PseudoRET implicit killed $v0
     %0:vr = COPY $v0
     %1:vr = COPY $v1
     %2:vr = COPY $v2
diff --git a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll
index 47b88ba71d5560..4c5fd3a10a4d54 100644
--- a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll
@@ -20,7 +20,9 @@ define signext i32 @foo(i32 signext %aa) #0 {
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    andi sp, sp, -16
 ; CHECK-NEXT:    mv s1, sp
-; CHECK-NEXT:    lw t0, 44(s1)
+; CHECK-NEXT:    sw a0, 52(s1)
+; CHECK-NEXT:    sw a0, 48(s1)
+; CHECK-NEXT:    lw a0, 44(s1)
 ; CHECK-NEXT:    lw a2, 40(s1)
 ; CHECK-NEXT:    lw a3, 36(s1)
 ; CHECK-NEXT:    lw a4, 32(s1)
@@ -30,14 +32,11 @@ define signext i32 @foo(i32 signext %aa) #0 {
 ; CHECK-NEXT:    lw t1, 16(s1)
 ; CHECK-NEXT:    lw a1, 12(s1)
 ; CHECK-NEXT:    lw t2, 8(s1)
-; CHECK-NEXT:    sw a0, 52(s1)
-; CHECK-NEXT:    sw a0, 48(s1)
 ; CHECK-NEXT:    addi sp, sp, -32
 ; CHECK-NEXT:    sd t2, 16(sp)
 ; CHECK-NEXT:    sd a1, 8(sp)
 ; CHECK-NEXT:    addi a1, s1, 48
 ; CHECK-NEXT:    sd t1, 0(sp)
-; CHECK-NEXT:    mv a0, t0
 ; CHECK-NEXT:    call gfunc
 ; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    li a0, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index eb02fd895f18d7..63005716860135 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -23,10 +23,10 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) {
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; CHECK-NEXT:    vadd.vi v12, v11, -16
 ; CHECK-NEXT:    vrgather.vv v9, v8, v12, v0.t
-; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
 ; CHECK-NEXT:    vrgather.vv v13, v10, v12
 ; CHECK-NEXT:    vadd.vi v10, v11, -15
+; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10, v0.t
 ; CHECK-NEXT:    vmsne.vi v8, v13, 0
 ; CHECK-NEXT:    vmv.v.v v0, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index bcb008857ad320..6a4ebb6b30af26 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -97,10 +97,10 @@ define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
-; CHECK-NEXT:    vmv1r.v v12, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
+; CHECK-NEXT:    vmv1r.v v12, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0
 ; CHECK-NEXT:    vmv1r.v v0, v12
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll
index 2e5b67c93fce1a..aa0a2e4a52f317 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmfeq_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -295,9 +292,8 @@ define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -503,9 +498,8 @@ define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmfeq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfeq.vv v0, v8, v9
 ; CHECK-NEXT:    vmfeq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfge.ll b/llvm/test/CodeGen/RISCV/rvv/vmfge.ll
index b5ca47707c8a82..1bc25a72c63762 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfge.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmfge_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmfge_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -295,9 +292,8 @@ define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 2 x i1> @intrinsic_vmfge_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -503,9 +498,8 @@ define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v9, v8
 ; CHECK-NEXT:    vmfle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll b/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll
index 971249d38d1b26..09629b6aa7f05d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmfgt_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -295,9 +292,8 @@ define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -503,9 +498,8 @@ define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v9, v8
 ; CHECK-NEXT:    vmflt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfle.ll b/llvm/test/CodeGen/RISCV/rvv/vmfle.ll
index f19a181a365afc..7b45abb5a42660 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfle.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmfle_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmfle_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -295,9 +292,8 @@ define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 2 x i1> @intrinsic_vmfle_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -503,9 +498,8 @@ define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmfle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfle.vv v0, v8, v9
 ; CHECK-NEXT:    vmfle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmflt.ll b/llvm/test/CodeGen/RISCV/rvv/vmflt.ll
index 0a046422193342..cea5a388c749b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmflt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmflt.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmflt_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmflt_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -295,9 +292,8 @@ define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 2 x i1> @intrinsic_vmflt_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -503,9 +498,8 @@ define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmflt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmflt.vv v0, v8, v9
 ; CHECK-NEXT:    vmflt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfne.ll b/llvm/test/CodeGen/RISCV/rvv/vmfne.ll
index 520099247e0f3d..6e216757ae1b2b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfne.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfne.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmfne_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmfne_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -295,9 +292,8 @@ define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 2 x i1> @intrinsic_vmfne_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -503,9 +498,8 @@ define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmfne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmfne.vv v0, v8, v9
 ; CHECK-NEXT:    vmfne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
index 9f181f7a30ebed..b9fa97e199ea67 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmseq_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmseq.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmseq.vv v0, v8, v9
 ; CHECK-NEXT:    vmseq.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
index 75fc407abbc2f3..0815fca81042ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmsge_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v9, v8
 ; CHECK-NEXT:    vmsle.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
index 5568c1e9b1cfb9..07adbad865580a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmsgeu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsleu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
index f1fa6484d976b4..c43c1643478518 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmsgt_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v9, v8
 ; CHECK-NEXT:    vmslt.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
index de7a0ad87be27c..9cef421e43bf4d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmsgtu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v9, v8
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v9, v8
 ; CHECK-NEXT:    vmsltu.vv v11, v10, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
index f54aef3ed4052c..2d0e2e52159b90 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmsle_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsle.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsle.vv v0, v8, v9
 ; CHECK-NEXT:    vmsle.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
index 540577247484e3..7a7f7bf8dc3848 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmsleu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsleu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsleu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
index 554d25172d4fde..ecdf673f45f4be 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmslt_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmslt.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmslt.vv v0, v8, v9
 ; CHECK-NEXT:    vmslt.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
index 7a8efa6c80fb6b..e5c8800caa5f1d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsltu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vmsltu.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
index bd6bd8a804bcc2..b657247ee9249b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
@@ -35,9 +35,8 @@ define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -87,9 +86,8 @@ define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -139,9 +137,8 @@ define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -191,9 +188,8 @@ define <vscale x 8 x i1> @intrinsic_vmsne_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -347,9 +343,8 @@ define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -399,9 +394,8 @@ define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -451,9 +445,8 @@ define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -607,9 +600,8 @@ define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    ret
@@ -659,9 +651,8 @@ define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
@@ -815,9 +806,8 @@ define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x
 ; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv1r.v v11, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vmsne.vv v0, v8, v9
 ; CHECK-NEXT:    vmsne.vv v11, v9, v10, v0.t
 ; CHECK-NEXT:    vmv.v.v v0, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
index b1980fcf420a82..9156457b4d240d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
@@ -494,9 +494,9 @@ define <vscale x 16 x double> @vselect_combine_regression(<vscale x 16 x i64> %v
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmseq.vi v7, v24, 0
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
+; CHECK-NEXT:    vmseq.vi v7, v24, 0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vle64.v v16, (a1), v0.t
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll
index c3b19b59ec3d68..40c3401363dda2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll
@@ -12,9 +12,8 @@ define i32 @illegal_preserve_vl(<vscale x 2 x i32> %a, <vscale x 4 x i64> %x, pt
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vadd.vv v12, v12, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    vs4r.v v12, (a0)
-; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %index = add <vscale x 4 x i64> %x, %x
   store <vscale x 4 x i64> %index, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
index 2bac1eeb90609d..56c80f33694f5b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
@@ -14,7 +14,7 @@ body:     |
     ; MIR-NEXT: WriteVXRMImm 0, implicit-def $vxrm
     ; MIR-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype
     ; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef $v8, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0  /* tu, mu */, implicit $vxrm, implicit $vl, implicit $vtype
-    ; MIR-NEXT: PseudoRET implicit $v8
+    ; MIR-NEXT: PseudoRET implicit killed $v8
     ; ASM-LABEL: verify_vxrm:
     ; ASM:        # %bb.0:
     ; ASM-NEXT:    csrwi	vxrm, 0
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed7257..6910f71e7271a4 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -64,9 +64,8 @@ define i64 @ashr64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sra a1, a1, a2
 ; RV32I-NEXT:    bltz a4, .LBB2_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a3, a3, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    srai a1, a3, 31
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB2_2:
 ; RV32I-NEXT:    srl a0, a0, a2
@@ -421,9 +420,8 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:    sra a1, a1, a2
 ; RV64I-NEXT:    bltz a4, .LBB7_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    srai a3, a3, 63
 ; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    srai a1, a3, 63
 ; RV64I-NEXT:    ret
 ; RV64I-NEXT:  .LBB7_2:
 ; RV64I-NEXT:    srl a0, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index 7fc4713ac2d6e1..55ccb4d130c860 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -21,10 +21,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    lh s0, 12(a1)
 ; RV32I-NEXT:    lh s1, 8(a1)
 ; RV32I-NEXT:    lh s2, 4(a1)
-; RV32I-NEXT:    lh a2, 0(a1)
 ; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    lh a0, 0(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, -124
@@ -113,10 +112,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    lh s0, 24(a1)
 ; RV64I-NEXT:    lh s1, 16(a1)
 ; RV64I-NEXT:    lh s2, 8(a1)
-; RV64I-NEXT:    lh a2, 0(a1)
 ; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    lh a0, 0(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, -124
@@ -209,10 +207,9 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    lh s0, 12(a1)
 ; RV32I-NEXT:    lh s1, 8(a1)
 ; RV32I-NEXT:    lh s2, 4(a1)
-; RV32I-NEXT:    lh a2, 0(a1)
 ; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    lh a0, 0(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 95
@@ -294,10 +291,9 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    lh s0, 24(a1)
 ; RV64I-NEXT:    lh s1, 16(a1)
 ; RV64I-NEXT:    lh s2, 8(a1)
-; RV64I-NEXT:    lh a2, 0(a1)
 ; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    lh a0, 0(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 95
@@ -775,10 +771,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lh s0, 12(a1)
 ; RV32I-NEXT:    lh s1, 8(a1)
-; RV32I-NEXT:    lh a2, 4(a1)
 ; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lh a0, 4(a1)
 ; RV32I-NEXT:    li a1, 654
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 23
@@ -852,10 +847,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lh s0, 24(a1)
 ; RV64I-NEXT:    lh s1, 16(a1)
-; RV64I-NEXT:    lh a2, 8(a1)
 ; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    lh a0, 8(a1)
 ; RV64I-NEXT:    li a1, 654
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
@@ -1091,11 +1085,10 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    lw s3, 20(a1)
 ; RV32I-NEXT:    lw s4, 8(a1)
 ; RV32I-NEXT:    lw s5, 12(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
 ; RV32I-NEXT:    mv s6, a0
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
 ; RV32I-NEXT:    li a2, 1
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3
 ; RV32I-NEXT:    mv s7, a0
@@ -1160,11 +1153,10 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    lw s3, 20(a1)
 ; RV32IM-NEXT:    lw s4, 8(a1)
 ; RV32IM-NEXT:    lw s5, 12(a1)
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    mv s6, a0
+; RV32IM-NEXT:    lw a0, 0(a1)
+; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    li a2, 1
-; RV32IM-NEXT:    mv a0, a3
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3
 ; RV32IM-NEXT:    mv s7, a0
@@ -1220,10 +1212,9 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    ld s0, 24(a1)
 ; RV64I-NEXT:    ld s1, 16(a1)
-; RV64I-NEXT:    ld a2, 8(a1)
 ; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    ld a0, 8(a1)
 ; RV64I-NEXT:    li a1, 654
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
index d3e495bb723ad8..818f803e2da18f 100644
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -19,11 +19,10 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
 define void @caller_extern(ptr %src) optsize {
 ; CHECK-LABEL: caller_extern:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a1, %hi(dest)
-; CHECK-NEXT:    addi a1, a1, %lo(dest)
-; CHECK-NEXT:    li a2, 7
 ; CHECK-NEXT:    mv a3, a0
-; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    lui a0, %hi(dest)
+; CHECK-NEXT:    addi a0, a0, %lo(dest)
+; CHECK-NEXT:    li a2, 7
 ; CHECK-NEXT:    mv a1, a3
 ; CHECK-NEXT:    tail memcpy
 entry:
@@ -36,11 +35,10 @@ entry:
 define void @caller_extern_pgso(ptr %src) !prof !14 {
 ; CHECK-LABEL: caller_extern_pgso:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a1, %hi(dest_pgso)
-; CHECK-NEXT:    addi a1, a1, %lo(dest_pgso)
-; CHECK-NEXT:    li a2, 7
 ; CHECK-NEXT:    mv a3, a0
-; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    lui a0, %hi(dest_pgso)
+; CHECK-NEXT:    addi a0, a0, %lo(dest_pgso)
+; CHECK-NEXT:    li a2, 7
 ; CHECK-NEXT:    mv a1, a3
 ; CHECK-NEXT:    tail memcpy
 entry:
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index 10497db6edc49f..9033bdc966f757 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -136,9 +136,8 @@ define i64 @load_i64(ptr %p) {
 ;
 ; RV32I-FAST-LABEL: load_i64:
 ; RV32I-FAST:       # %bb.0:
-; RV32I-FAST-NEXT:    lw a2, 0(a0)
 ; RV32I-FAST-NEXT:    lw a1, 4(a0)
-; RV32I-FAST-NEXT:    mv a0, a2
+; RV32I-FAST-NEXT:    lw a0, 0(a0)
 ; RV32I-FAST-NEXT:    ret
 ;
 ; RV64I-FAST-LABEL: load_i64:
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index c057c656e0fb70..be0ca4609fd193 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -22,10 +22,9 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    lhu s0, 12(a1)
 ; RV32I-NEXT:    lhu s1, 8(a1)
 ; RV32I-NEXT:    lhu s2, 4(a1)
-; RV32I-NEXT:    lhu a2, 0(a1)
 ; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    lhu a0, 0(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 124
@@ -101,10 +100,9 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    lhu s0, 24(a1)
 ; RV64I-NEXT:    lhu s1, 16(a1)
 ; RV64I-NEXT:    lhu s2, 8(a1)
-; RV64I-NEXT:    lhu a2, 0(a1)
 ; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    lhu a0, 0(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 124
@@ -184,10 +182,9 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    lhu s0, 12(a1)
 ; RV32I-NEXT:    lhu s1, 8(a1)
 ; RV32I-NEXT:    lhu s2, 4(a1)
-; RV32I-NEXT:    lhu a2, 0(a1)
 ; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    lhu a0, 0(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 95
@@ -253,10 +250,9 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    lhu s0, 24(a1)
 ; RV64I-NEXT:    lhu s1, 16(a1)
 ; RV64I-NEXT:    lhu s2, 8(a1)
-; RV64I-NEXT:    lhu a2, 0(a1)
 ; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    lhu a0, 0(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 95
@@ -536,10 +532,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    lhu s1, 8(a1)
 ; RV32I-NEXT:    lhu s2, 4(a1)
 ; RV32I-NEXT:    lhu s3, 0(a1)
-; RV32I-NEXT:    lhu a2, 12(a1)
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lhu a0, 12(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    andi a1, s3, 63
 ; RV32I-NEXT:    andi a2, s2, 31
@@ -588,10 +583,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    lhu s1, 16(a1)
 ; RV64I-NEXT:    lhu s2, 8(a1)
 ; RV64I-NEXT:    lhu s3, 0(a1)
-; RV64I-NEXT:    lhu a2, 24(a1)
 ; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lhu a0, 24(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    andi a1, s3, 63
 ; RV64I-NEXT:    andi a2, s2, 31
@@ -644,10 +638,9 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lhu s0, 12(a1)
 ; RV32I-NEXT:    lhu s1, 8(a1)
-; RV32I-NEXT:    lhu a2, 4(a1)
 ; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lhu a0, 4(a1)
 ; RV32I-NEXT:    li a1, 654
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 23
@@ -710,10 +703,9 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lhu s0, 24(a1)
 ; RV64I-NEXT:    lhu s1, 16(a1)
-; RV64I-NEXT:    lhu a2, 8(a1)
 ; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    lhu a0, 8(a1)
 ; RV64I-NEXT:    li a1, 654
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
@@ -799,11 +791,10 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    lw s3, 20(a1)
 ; RV32I-NEXT:    lw s4, 8(a1)
 ; RV32I-NEXT:    lw s5, 12(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
 ; RV32I-NEXT:    mv s6, a0
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
 ; RV32I-NEXT:    li a2, 1
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3
 ; RV32I-NEXT:    mv s7, a0
@@ -868,11 +859,10 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    lw s3, 20(a1)
 ; RV32IM-NEXT:    lw s4, 8(a1)
 ; RV32IM-NEXT:    lw s5, 12(a1)
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    mv s6, a0
+; RV32IM-NEXT:    lw a0, 0(a1)
+; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    li a2, 1
-; RV32IM-NEXT:    mv a0, a3
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3
 ; RV32IM-NEXT:    mv s7, a0
@@ -928,10 +918,9 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    ld s0, 24(a1)
 ; RV64I-NEXT:    ld s1, 16(a1)
-; RV64I-NEXT:    ld a2, 8(a1)
 ; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    ld a0, 8(a1)
 ; RV64I-NEXT:    li a1, 654
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
diff --git a/llvm/test/CodeGen/RISCV/wide-mem.ll b/llvm/test/CodeGen/RISCV/wide-mem.ll
index f98680a9f2daef..05270af824551d 100644
--- a/llvm/test/CodeGen/RISCV/wide-mem.ll
+++ b/llvm/test/CodeGen/RISCV/wide-mem.ll
@@ -7,9 +7,8 @@
 define i64 @load_i64(ptr %a) nounwind {
 ; RV32I-LABEL: load_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 0(a0)
 ; RV32I-NEXT:    lw a1, 4(a0)
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    ret
   %1 = load i64, ptr %a
   ret i64 %1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92bd..8d6a5cd1c38b4c 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -560,9 +560,8 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    srai a1, a4, 31
 ; RV32I-NEXT:    j .LBB5_3
 ; RV32I-NEXT:  .LBB5_2:
 ; RV32I-NEXT:    lbu a4, 1(a0)
@@ -1094,9 +1093,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sra a1, a3, a5
 ; RV64I-NEXT:    bltz a6, .LBB8_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    sraiw a3, a4, 31
 ; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    sraiw a1, a4, 31
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
 ; RV64I-NEXT:    lbu a4, 1(a0)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afaa..4c2ab1230ee6e0 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -543,9 +543,8 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    srai a1, a4, 31
 ; RV32I-NEXT:    j .LBB5_3
 ; RV32I-NEXT:  .LBB5_2:
 ; RV32I-NEXT:    lbu a4, 1(a0)
@@ -1203,9 +1202,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sra a1, a3, a5
 ; RV64I-NEXT:    bltz a6, .LBB8_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    sraiw a3, a4, 31
 ; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    sraiw a1, a4, 31
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
 ; RV64I-NEXT:    lbu a4, 1(a0)
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index b1efe53290e8ee..c99b7e9a759fc5 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -866,11 +866,10 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-NEXT:    xor a6, a1, a5
 ; RV32-NEXT:    xor a1, a1, a3
 ; RV32-NEXT:    and a1, a1, a6
-; RV32-NEXT:    slti a1, a1, 0
 ; RV32-NEXT:    sub a0, a0, a2
 ; RV32-NEXT:    sw a0, 0(a4)
+; RV32-NEXT:    slti a0, a1, 0
 ; RV32-NEXT:    sw a5, 4(a4)
-; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ssubo.i64:
@@ -890,11 +889,10 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-NEXT:    xor a6, a1, a5
 ; RV32ZBA-NEXT:    xor a1, a1, a3
 ; RV32ZBA-NEXT:    and a1, a1, a6
-; RV32ZBA-NEXT:    slti a1, a1, 0
 ; RV32ZBA-NEXT:    sub a0, a0, a2
 ; RV32ZBA-NEXT:    sw a0, 0(a4)
+; RV32ZBA-NEXT:    slti a0, a1, 0
 ; RV32ZBA-NEXT:    sw a5, 4(a4)
-; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: ssubo.i64:
@@ -914,11 +912,10 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND-NEXT:    xor a6, a1, a5
 ; RV32ZICOND-NEXT:    xor a1, a1, a3
 ; RV32ZICOND-NEXT:    and a1, a1, a6
-; RV32ZICOND-NEXT:    slti a1, a1, 0
 ; RV32ZICOND-NEXT:    sub a0, a0, a2
 ; RV32ZICOND-NEXT:    sw a0, 0(a4)
+; RV32ZICOND-NEXT:    slti a0, a1, 0
 ; RV32ZICOND-NEXT:    sw a5, 4(a4)
-; RV32ZICOND-NEXT:    mv a0, a1
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: ssubo.i64:
@@ -1367,11 +1364,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-NEXT:    xor a1, a1, a3
 ; RV32-NEXT:    xor a3, s0, a3
 ; RV32-NEXT:    or a1, a3, a1
-; RV32-NEXT:    snez a1, a1
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    sw a0, 0(a4)
+; RV32-NEXT:    snez a0, a1
 ; RV32-NEXT:    sw a5, 4(a4)
-; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1438,11 +1434,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-NEXT:    xor a1, a1, a3
 ; RV32ZBA-NEXT:    xor a3, s0, a3
 ; RV32ZBA-NEXT:    or a1, a3, a1
-; RV32ZBA-NEXT:    snez a1, a1
 ; RV32ZBA-NEXT:    mul a0, a0, a2
 ; RV32ZBA-NEXT:    sw a0, 0(a4)
+; RV32ZBA-NEXT:    snez a0, a1
 ; RV32ZBA-NEXT:    sw a5, 4(a4)
-; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    addi sp, sp, 16
@@ -1509,11 +1504,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND-NEXT:    xor a1, a1, a3
 ; RV32ZICOND-NEXT:    xor a3, s0, a3
 ; RV32ZICOND-NEXT:    or a1, a3, a1
-; RV32ZICOND-NEXT:    snez a1, a1
 ; RV32ZICOND-NEXT:    mul a0, a0, a2
 ; RV32ZICOND-NEXT:    sw a0, 0(a4)
+; RV32ZICOND-NEXT:    snez a0, a1
 ; RV32ZICOND-NEXT:    sw a5, 4(a4)
-; RV32ZICOND-NEXT:    mv a0, a1
 ; RV32ZICOND-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZICOND-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZICOND-NEXT:    addi sp, sp, 16
@@ -1556,11 +1550,10 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; RV32-NEXT:    add a1, a1, a5
 ; RV32-NEXT:    xor a1, a1, a7
 ; RV32-NEXT:    or a1, t0, a1
-; RV32-NEXT:    snez a1, a1
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    snez a0, a1
 ; RV32-NEXT:    sw a4, 4(a2)
-; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: smulo2.i64:
@@ -1595,12 +1588,11 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZBA-NEXT:    add a1, a1, a5
 ; RV32ZBA-NEXT:    xor a1, a1, a7
 ; RV32ZBA-NEXT:    or a1, t0, a1
-; RV32ZBA-NEXT:    snez a1, a1
 ; RV32ZBA-NEXT:    sh1add a3, a0, a0
 ; RV32ZBA-NEXT:    sh2add a0, a3, a0
 ; RV32ZBA-NEXT:    sw a0, 0(a2)
+; RV32ZBA-NEXT:    snez a0, a1
 ; RV32ZBA-NEXT:    sw a4, 4(a2)
-; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo2.i64:
@@ -1634,11 +1626,10 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZICOND-NEXT:    add a1, a1, a5
 ; RV32ZICOND-NEXT:    xor a1, a1, a7
 ; RV32ZICOND-NEXT:    or a1, t0, a1
-; RV32ZICOND-NEXT:    snez a1, a1
 ; RV32ZICOND-NEXT:    mul a0, a0, a3
 ; RV32ZICOND-NEXT:    sw a0, 0(a2)
+; RV32ZICOND-NEXT:    snez a0, a1
 ; RV32ZICOND-NEXT:    sw a4, 4(a2)
-; RV32ZICOND-NEXT:    mv a0, a1
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: smulo2.i64:
@@ -1663,10 +1654,9 @@ define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32-LABEL: umulo.i32:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    mulhu a3, a0, a1
-; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    sw a0, 0(a2)
-; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    snez a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo.i32:
@@ -1682,10 +1672,9 @@ define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32ZBA-LABEL: umulo.i32:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    mulhu a3, a0, a1
-; RV32ZBA-NEXT:    snez a3, a3
 ; RV32ZBA-NEXT:    mul a0, a0, a1
 ; RV32ZBA-NEXT:    sw a0, 0(a2)
-; RV32ZBA-NEXT:    mv a0, a3
+; RV32ZBA-NEXT:    snez a0, a3
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo.i32:
@@ -1701,10 +1690,9 @@ define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32ZICOND-LABEL: umulo.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    mulhu a3, a0, a1
-; RV32ZICOND-NEXT:    snez a3, a3
 ; RV32ZICOND-NEXT:    mul a0, a0, a1
 ; RV32ZICOND-NEXT:    sw a0, 0(a2)
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    snez a0, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo.i32:
@@ -1729,10 +1717,9 @@ define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a3, 13
 ; RV32-NEXT:    mulhu a2, a0, a3
-; RV32-NEXT:    snez a2, a2
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    sw a0, 0(a1)
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    snez a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo2.i32:
@@ -1750,11 +1737,10 @@ define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    li a2, 13
 ; RV32ZBA-NEXT:    mulhu a2, a0, a2
-; RV32ZBA-NEXT:    snez a2, a2
 ; RV32ZBA-NEXT:    sh1add a3, a0, a0
 ; RV32ZBA-NEXT:    sh2add a0, a3, a0
 ; RV32ZBA-NEXT:    sw a0, 0(a1)
-; RV32ZBA-NEXT:    mv a0, a2
+; RV32ZBA-NEXT:    snez a0, a2
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo2.i32:
@@ -1771,10 +1757,9 @@ define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    li a3, 13
 ; RV32ZICOND-NEXT:    mulhu a2, a0, a3
-; RV32ZICOND-NEXT:    snez a2, a2
 ; RV32ZICOND-NEXT:    mul a0, a0, a3
 ; RV32ZICOND-NEXT:    sw a0, 0(a1)
-; RV32ZICOND-NEXT:    mv a0, a2
+; RV32ZICOND-NEXT:    snez a0, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo2.i32:
@@ -1882,20 +1867,18 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-NEXT:    mulhu a3, a3, a0
 ; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    or a1, a1, a3
-; RV32-NEXT:    or a1, a1, a6
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    sw a0, 0(a4)
+; RV32-NEXT:    or a0, a1, a6
 ; RV32-NEXT:    sw a5, 4(a4)
-; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo.i64:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    mulhu a3, a0, a1
-; RV64-NEXT:    snez a3, a3
 ; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    sd a0, 0(a2)
-; RV64-NEXT:    mv a0, a3
+; RV64-NEXT:    snez a0, a3
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: umulo.i64:
@@ -1915,20 +1898,18 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-NEXT:    mulhu a3, a3, a0
 ; RV32ZBA-NEXT:    snez a3, a3
 ; RV32ZBA-NEXT:    or a1, a1, a3
-; RV32ZBA-NEXT:    or a1, a1, a6
 ; RV32ZBA-NEXT:    mul a0, a0, a2
 ; RV32ZBA-NEXT:    sw a0, 0(a4)
+; RV32ZBA-NEXT:    or a0, a1, a6
 ; RV32ZBA-NEXT:    sw a5, 4(a4)
-; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo.i64:
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    mulhu a3, a0, a1
-; RV64ZBA-NEXT:    snez a3, a3
 ; RV64ZBA-NEXT:    mul a0, a0, a1
 ; RV64ZBA-NEXT:    sd a0, 0(a2)
-; RV64ZBA-NEXT:    mv a0, a3
+; RV64ZBA-NEXT:    snez a0, a3
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: umulo.i64:
@@ -1948,20 +1929,18 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND-NEXT:    mulhu a3, a3, a0
 ; RV32ZICOND-NEXT:    snez a3, a3
 ; RV32ZICOND-NEXT:    or a1, a1, a3
-; RV32ZICOND-NEXT:    or a1, a1, a6
 ; RV32ZICOND-NEXT:    mul a0, a0, a2
 ; RV32ZICOND-NEXT:    sw a0, 0(a4)
+; RV32ZICOND-NEXT:    or a0, a1, a6
 ; RV32ZICOND-NEXT:    sw a5, 4(a4)
-; RV32ZICOND-NEXT:    mv a0, a1
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
 ; RV64ZICOND-NEXT:    mulhu a3, a0, a1
-; RV64ZICOND-NEXT:    snez a3, a3
 ; RV64ZICOND-NEXT:    mul a0, a0, a1
 ; RV64ZICOND-NEXT:    sd a0, 0(a2)
-; RV64ZICOND-NEXT:    mv a0, a3
+; RV64ZICOND-NEXT:    snez a0, a3
 ; RV64ZICOND-NEXT:    ret
 entry:
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
@@ -1981,21 +1960,19 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32-NEXT:    sltu a5, a4, a5
 ; RV32-NEXT:    mulhu a1, a1, a3
 ; RV32-NEXT:    snez a1, a1
-; RV32-NEXT:    or a1, a1, a5
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    or a0, a1, a5
 ; RV32-NEXT:    sw a4, 4(a2)
-; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo2.i64:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    li a3, 13
 ; RV64-NEXT:    mulhu a2, a0, a3
-; RV64-NEXT:    snez a2, a2
 ; RV64-NEXT:    mul a0, a0, a3
 ; RV64-NEXT:    sd a0, 0(a1)
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    snez a0, a2
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: umulo2.i64:
@@ -2008,23 +1985,21 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZBA-NEXT:    sltu a4, a5, a4
 ; RV32ZBA-NEXT:    mulhu a1, a1, a3
 ; RV32ZBA-NEXT:    snez a1, a1
-; RV32ZBA-NEXT:    or a1, a1, a4
 ; RV32ZBA-NEXT:    sh1add a3, a0, a0
 ; RV32ZBA-NEXT:    sh2add a0, a3, a0
 ; RV32ZBA-NEXT:    sw a0, 0(a2)
+; RV32ZBA-NEXT:    or a0, a1, a4
 ; RV32ZBA-NEXT:    sw a5, 4(a2)
-; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo2.i64:
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    li a2, 13
 ; RV64ZBA-NEXT:    mulhu a2, a0, a2
-; RV64ZBA-NEXT:    snez a2, a2
 ; RV64ZBA-NEXT:    sh1add a3, a0, a0
 ; RV64ZBA-NEXT:    sh2add a0, a3, a0
 ; RV64ZBA-NEXT:    sd a0, 0(a1)
-; RV64ZBA-NEXT:    mv a0, a2
+; RV64ZBA-NEXT:    snez a0, a2
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: umulo2.i64:
@@ -2036,21 +2011,19 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZICOND-NEXT:    sltu a5, a4, a5
 ; RV32ZICOND-NEXT:    mulhu a1, a1, a3
 ; RV32ZICOND-NEXT:    snez a1, a1
-; RV32ZICOND-NEXT:    or a1, a1, a5
 ; RV32ZICOND-NEXT:    mul a0, a0, a3
 ; RV32ZICOND-NEXT:    sw a0, 0(a2)
+; RV32ZICOND-NEXT:    or a0, a1, a5
 ; RV32ZICOND-NEXT:    sw a4, 4(a2)
-; RV32ZICOND-NEXT:    mv a0, a1
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo2.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
 ; RV64ZICOND-NEXT:    li a3, 13
 ; RV64ZICOND-NEXT:    mulhu a2, a0, a3
-; RV64ZICOND-NEXT:    snez a2, a2
 ; RV64ZICOND-NEXT:    mul a0, a0, a3
 ; RV64ZICOND-NEXT:    sd a0, 0(a1)
-; RV64ZICOND-NEXT:    mv a0, a2
+; RV64ZICOND-NEXT:    snez a0, a2
 ; RV64ZICOND-NEXT:    ret
 entry:
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 13)
diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
index 46aa383866e93a..bbf1df20aeda58 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
@@ -455,12 +455,12 @@ define ptr @swib(ptr %base, i32 %a, i32 %b) {
 define ptr @sdia(ptr %base, i64 %a, i64 %b) {
 ; RV32XTHEADMEMIDX-LABEL: sdia:
 ; RV32XTHEADMEMIDX:       # %bb.0:
-; RV32XTHEADMEMIDX-NEXT:    addi a5, a0, 64
 ; RV32XTHEADMEMIDX-NEXT:    add a2, a2, a4
 ; RV32XTHEADMEMIDX-NEXT:    add a3, a1, a3
 ; RV32XTHEADMEMIDX-NEXT:    sltu a1, a3, a1
 ; RV32XTHEADMEMIDX-NEXT:    add a1, a2, a1
 ; RV32XTHEADMEMIDX-NEXT:    sw a3, 0(a0)
+; RV32XTHEADMEMIDX-NEXT:    addi a5, a0, 64
 ; RV32XTHEADMEMIDX-NEXT:    sw a1, 4(a0)
 ; RV32XTHEADMEMIDX-NEXT:    mv a0, a5
 ; RV32XTHEADMEMIDX-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir b/llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir
index 93931ff950a8c0..81ca7ca9bb8bdf 100644
--- a/llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir
+++ b/llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir
@@ -19,7 +19,7 @@ body:                   |
     ; CHECK-ZCMP32-LABEL: name: popret_rvlist5
     ; CHECK-ZCMP32: liveins: $x1, $x8
     ; CHECK-ZCMP32-NEXT: {{  $}}
-    ; CHECK-ZCMP32-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit $x1, implicit $x8
+    ; CHECK-ZCMP32-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit killed $x1, implicit killed $x8
     ; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
     ; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -8
     ; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -4
@@ -41,7 +41,7 @@ body:                   |
     ; CHECK-ZCMP64-LABEL: name: popret_rvlist5
     ; CHECK-ZCMP64: liveins: $x1, $x8
     ; CHECK-ZCMP64-NEXT: {{  $}}
-    ; CHECK-ZCMP64-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit $x1, implicit $x8
+    ; CHECK-ZCMP64-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit killed $x1, implicit killed $x8
     ; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
     ; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -16
     ; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -8
@@ -103,7 +103,7 @@ body:                   |
     ; CHECK-ZCMP32-LABEL: name: popretz_rvlist5
     ; CHECK-ZCMP32: liveins: $x1, $x8
     ; CHECK-ZCMP32-NEXT: {{  $}}
-    ; CHECK-ZCMP32-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit $x1, implicit $x8
+    ; CHECK-ZCMP32-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit killed $x1, implicit killed $x8
     ; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
     ; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -8
     ; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -4
@@ -121,12 +121,12 @@ body:                   |
     ; CHECK-LIBCALL32-NEXT: $x1 = IMPLICIT_DEF
     ; CHECK-LIBCALL32-NEXT: $x8 = IMPLICIT_DEF
     ; CHECK-LIBCALL32-NEXT: $x10 = ADDI $x0, 0
-    ; CHECK-LIBCALL32-NEXT: frame-destroy PseudoTAIL target-flags(riscv-call) &__riscv_restore_1, implicit $x2, implicit $x10
+    ; CHECK-LIBCALL32-NEXT: frame-destroy PseudoTAIL target-flags(riscv-call) &__riscv_restore_1, implicit $x2, implicit killed $x10
     ;
     ; CHECK-ZCMP64-LABEL: name: popretz_rvlist5
     ; CHECK-ZCMP64: liveins: $x1, $x8
     ; CHECK-ZCMP64-NEXT: {{  $}}
-    ; CHECK-ZCMP64-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit $x1, implicit $x8
+    ; CHECK-ZCMP64-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit killed $x1, implicit killed $x8
     ; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
     ; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -16
     ; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -8
@@ -144,7 +144,7 @@ body:                   |
     ; CHECK-LIBCALL64-NEXT: $x1 = IMPLICIT_DEF
     ; CHECK-LIBCALL64-NEXT: $x8 = IMPLICIT_DEF
     ; CHECK-LIBCALL64-NEXT: $x10 = ADDI $x0, 0
-    ; CHECK-LIBCALL64-NEXT: frame-destroy PseudoTAIL target-flags(riscv-call) &__riscv_restore_1, implicit $x2, implicit $x10
+    ; CHECK-LIBCALL64-NEXT: frame-destroy PseudoTAIL target-flags(riscv-call) &__riscv_restore_1, implicit $x2, implicit killed $x10
     ;
     ; CHECK-NO-ZCMP32-LABEL: name: popretz_rvlist5
     ; CHECK-NO-ZCMP32: liveins: $x1, $x8
@@ -161,7 +161,7 @@ body:                   |
     ; CHECK-NO-ZCMP32-NEXT: $x1 = LW $x2, 12 :: (load (s32) from %stack.0)
     ; CHECK-NO-ZCMP32-NEXT: $x8 = LW $x2, 8 :: (load (s32) from %stack.1)
     ; CHECK-NO-ZCMP32-NEXT: $x2 = frame-destroy ADDI $x2, 16
-    ; CHECK-NO-ZCMP32-NEXT: PseudoRET implicit $x10
+    ; CHECK-NO-ZCMP32-NEXT: PseudoRET implicit killed $x10
     ;
     ; CHECK-NO-ZCMP64-LABEL: name: popretz_rvlist5
     ; CHECK-NO-ZCMP64: liveins: $x1, $x8
@@ -178,7 +178,7 @@ body:                   |
     ; CHECK-NO-ZCMP64-NEXT: $x1 = LD $x2, 8 :: (load (s64) from %stack.0)
     ; CHECK-NO-ZCMP64-NEXT: $x8 = LD $x2, 0 :: (load (s64) from %stack.1)
     ; CHECK-NO-ZCMP64-NEXT: $x2 = frame-destroy ADDI $x2, 16
-    ; CHECK-NO-ZCMP64-NEXT: PseudoRET implicit $x10
+    ; CHECK-NO-ZCMP64-NEXT: PseudoRET implicit killed $x10
     $x1 = IMPLICIT_DEF
     $x8 = IMPLICIT_DEF
     $x10 = COPY $x0
diff --git a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
index f8557419c41990..abb7fff831afe5 100644
--- a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
@@ -207,11 +207,10 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; ARM-NEXT:    .save {r4, lr}
 ; ARM-NEXT:    push {r4, lr}
 ; ARM-NEXT:    lsls r0, r0, #28
-; ARM-NEXT:    asrs r4, r0, #31
 ; ARM-NEXT:    lsls r1, r1, #28
 ; ARM-NEXT:    asrs r2, r1, #28
 ; ARM-NEXT:    asrs r3, r1, #31
-; ARM-NEXT:    mov r1, r4
+; ARM-NEXT:    asrs r1, r0, #31
 ; ARM-NEXT:    bl __aeabi_lmul
 ; ARM-NEXT:    cmp r1, #1
 ; ARM-NEXT:    bgt .LBB2_2
@@ -387,11 +386,10 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
 ; ARM-NEXT:    .save {r4, lr}
 ; ARM-NEXT:    push {r4, lr}
 ; ARM-NEXT:    lsls r0, r0, #28
-; ARM-NEXT:    asrs r4, r0, #31
 ; ARM-NEXT:    lsls r1, r1, #28
 ; ARM-NEXT:    asrs r2, r1, #28
 ; ARM-NEXT:    asrs r3, r1, #31
-; ARM-NEXT:    mov r1, r4
+; ARM-NEXT:    asrs r1, r0, #31
 ; ARM-NEXT:    bl __aeabi_lmul
 ; ARM-NEXT:    cmp r1, #0
 ; ARM-NEXT:    bmi .LBB5_2
diff --git a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
index 9b5fa1c2bc8113..b0cc1c68862983 100644
--- a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
@@ -9,10 +9,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV6-NEXT:    .pad #60
 ; THUMBV6-NEXT:    sub sp, #60
 ; THUMBV6-NEXT:    mov r6, r3
-; THUMBV6-NEXT:    mov r1, r2
-; THUMBV6-NEXT:    str r2, [sp, #52] @ 4-byte Spill
 ; THUMBV6-NEXT:    mov r4, r0
 ; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r1, r2
+; THUMBV6-NEXT:    str r2, [sp, #52] @ 4-byte Spill
 ; THUMBV6-NEXT:    ldr r2, [sp, #88]
 ; THUMBV6-NEXT:    str r2, [sp, #48] @ 4-byte Spill
 ; THUMBV6-NEXT:    movs r5, #0
diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
index 939ab71a8061c5..8b8ed524240c2d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
@@ -750,11 +750,10 @@ define arm_aapcs_vfpcc <4 x float> @frem_f32(<4 x float> %in1, <4 x float> %in2)
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    vmov r4, r2, d10
-; CHECK-NEXT:    vmov r5, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d10
+; CHECK-NEXT:    vmov r5, r1, d8
 ; CHECK-NEXT:    vmov s18, r6
-; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -885,11 +884,9 @@ define arm_aapcs_vfpcc <2 x double> @fdiv_f64(<2 x double> %in1, <2 x double> %i
 ; CHECK-NEXT:    vmov r0, r1, d11
 ; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_ddiv
-; CHECK-NEXT:    vmov lr, r12, d10
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, lr
-; CHECK-NEXT:    mov r1, r12
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl __aeabi_ddiv
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -912,11 +909,9 @@ define arm_aapcs_vfpcc <2 x double> @frem_f64(<2 x double> %in1, <2 x double> %i
 ; CHECK-NEXT:    vmov r0, r1, d11
 ; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl fmod
-; CHECK-NEXT:    vmov lr, r12, d10
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, lr
-; CHECK-NEXT:    mov r1, r12
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl fmod
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
index d747da76a45fae..da3c0d66f22094 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
@@ -52,10 +52,8 @@ define arm_aapcs_vfpcc <2 x double> @sqrt_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl sqrt
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl sqrt
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -79,10 +77,9 @@ define arm_aapcs_vfpcc <4 x float> @cos_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl cosf
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d8
 ; CHECK-NEXT:    vmov s18, r5
-; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -109,8 +106,7 @@ define arm_aapcs_vfpcc <8 x half> @cos_float16_t(<8 x half> %src) {
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov s16, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vcvtb.f16.f32 s20, s16
@@ -163,10 +159,8 @@ define arm_aapcs_vfpcc <2 x double> @cos_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl cos
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl cos
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -190,10 +184,9 @@ define arm_aapcs_vfpcc <4 x float> @sin_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl sinf
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d8
 ; CHECK-NEXT:    vmov s18, r5
-; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -220,8 +213,7 @@ define arm_aapcs_vfpcc <8 x half> @sin_float16_t(<8 x half> %src) {
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov s16, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vcvtb.f16.f32 s20, s16
@@ -274,10 +266,8 @@ define arm_aapcs_vfpcc <2 x double> @sin_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl sin
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl sin
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -301,10 +291,9 @@ define arm_aapcs_vfpcc <4 x float> @tan_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl tanf
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d8
 ; CHECK-NEXT:    vmov s18, r5
-; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl tanf
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -331,8 +320,7 @@ define arm_aapcs_vfpcc <8 x half> @tan_float16_t(<8 x half> %src) {
 ; CHECK-NEXT:    bl tanf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov s16, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl tanf
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vcvtb.f16.f32 s20, s16
@@ -385,10 +373,8 @@ define arm_aapcs_vfpcc <2 x double> @tan_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl tan
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl tan
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -412,10 +398,9 @@ define arm_aapcs_vfpcc <4 x float> @exp_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl expf
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d8
 ; CHECK-NEXT:    vmov s18, r5
-; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -442,8 +427,7 @@ define arm_aapcs_vfpcc <8 x half> @exp_float16_t(<8 x half> %src) {
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov s16, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl expf
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vcvtb.f16.f32 s20, s16
@@ -496,10 +480,8 @@ define arm_aapcs_vfpcc <2 x double> @exp_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl exp
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl exp
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -523,10 +505,9 @@ define arm_aapcs_vfpcc <4 x float> @exp2_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl exp2f
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d8
 ; CHECK-NEXT:    vmov s18, r5
-; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -553,8 +534,7 @@ define arm_aapcs_vfpcc <8 x half> @exp2_float16_t(<8 x half> %src) {
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov s16, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl exp2f
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vcvtb.f16.f32 s20, s16
@@ -607,10 +587,8 @@ define arm_aapcs_vfpcc <2 x double> @exp2_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl exp2
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl exp2
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -634,10 +612,9 @@ define arm_aapcs_vfpcc <4 x float> @log_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl logf
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d8
 ; CHECK-NEXT:    vmov s18, r5
-; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -664,8 +641,7 @@ define arm_aapcs_vfpcc <8 x half> @log_float16_t(<8 x half> %src) {
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov s16, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl logf
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vcvtb.f16.f32 s20, s16
@@ -718,10 +694,8 @@ define arm_aapcs_vfpcc <2 x double> @log_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl log
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl log
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -745,10 +719,9 @@ define arm_aapcs_vfpcc <4 x float> @log2_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl log2f
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d8
 ; CHECK-NEXT:    vmov s18, r5
-; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -775,8 +748,7 @@ define arm_aapcs_vfpcc <8 x half> @log2_float16_t(<8 x half> %src) {
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov s16, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log2f
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vcvtb.f16.f32 s20, s16
@@ -829,10 +801,8 @@ define arm_aapcs_vfpcc <2 x double> @log2_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl log2
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl log2
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -856,10 +826,9 @@ define arm_aapcs_vfpcc <4 x float> @log10_float32_t(<4 x float> %src) {
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl log10f
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d8
 ; CHECK-NEXT:    vmov s18, r5
-; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -886,8 +855,7 @@ define arm_aapcs_vfpcc <8 x half> @log10_float16_t(<8 x half> %src) {
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s16
 ; CHECK-NEXT:    vmov s16, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bl log10f
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vcvtb.f16.f32 s20, s16
@@ -940,10 +908,8 @@ define arm_aapcs_vfpcc <2 x double> @log10_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl log10
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl log10
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -970,11 +936,10 @@ define arm_aapcs_vfpcc <4 x float> @pow_float32_t(<4 x float> %src1, <4 x float>
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl powf
-; CHECK-NEXT:    vmov r4, r2, d10
-; CHECK-NEXT:    vmov r5, r1, d8
 ; CHECK-NEXT:    vmov s19, r0
+; CHECK-NEXT:    vmov r4, r0, d10
+; CHECK-NEXT:    vmov r5, r1, d8
 ; CHECK-NEXT:    vmov s18, r6
-; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl powf
 ; CHECK-NEXT:    vmov s17, r0
 ; CHECK-NEXT:    mov r0, r4
@@ -1075,11 +1040,9 @@ define arm_aapcs_vfpcc <2 x double> @pow_float64_t(<2 x double> %src1, <2 x doub
 ; CHECK-NEXT:    vmov r0, r1, d11
 ; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl pow
-; CHECK-NEXT:    vmov lr, r12, d10
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, lr
-; CHECK-NEXT:    mov r1, r12
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl pow
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
index f2ac5268921800..de4b24da27a8d4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
@@ -622,23 +622,23 @@ define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    vmovx.f16 s6, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s10, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vcvt.s32.f16 s5, s3
-; CHECK-NEXT:    vcvt.s32.f16 s12, s0
+; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vmovx.f16 s0, s2
+; CHECK-NEXT:    vcvt.s32.f16 s5, s3
 ; CHECK-NEXT:    vcvt.s32.f16 s7, s2
-; CHECK-NEXT:    vcvt.s32.f16 s14, s0
+; CHECK-NEXT:    vcvt.s32.f16 s8, s1
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vcvt.s32.f16 s8, s1
+; CHECK-NEXT:    vcvt.s32.f16 s14, s0
+; CHECK-NEXT:    vcvt.s32.f16 s12, s0
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    vcvt.s32.f16 s4, s4
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
 ; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    mov r0, sp
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
@@ -1704,23 +1704,23 @@ define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    vmovx.f16 s6, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s10, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vcvt.s32.f16 s5, s3
-; CHECK-NEXT:    vcvt.s32.f16 s12, s0
+; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vmovx.f16 s0, s2
+; CHECK-NEXT:    vcvt.s32.f16 s5, s3
 ; CHECK-NEXT:    vcvt.s32.f16 s7, s2
-; CHECK-NEXT:    vcvt.s32.f16 s14, s0
+; CHECK-NEXT:    vcvt.s32.f16 s8, s1
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vcvt.s32.f16 s8, s1
+; CHECK-NEXT:    vcvt.s32.f16 s14, s0
+; CHECK-NEXT:    vcvt.s32.f16 s12, s0
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    vcvt.s32.f16 s4, s4
-; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
 ; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    mov r0, sp
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index 81b6a6940a7d6b..aa8c618b412748 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -1379,12 +1379,12 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f16_v5i32(<5 x half> %f) {
 ; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s8, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
-; CHECK-NEXT:    vcvt.s32.f16 s4, s4
-; CHECK-NEXT:    vcvt.s32.f16 s6, s6
 ; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvt.s32.f16 s2, s2
+; CHECK-NEXT:    vcvt.s32.f16 s6, s6
 ; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vcvt.s32.f16 s4, s4
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vcvt.s32.f16 s2, s2
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
@@ -1404,11 +1404,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-NEXT:    vcvt.s32.f16 s10, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s4, s2
-; CHECK-NEXT:    vmovx.f16 s2, s2
-; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vcvt.s32.f16 s2, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s8, s8
+; CHECK-NEXT:    vcvt.s32.f16 s6, s6
 ; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vcvt.s32.f16 s2, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
 ; CHECK-NEXT:    vmov r1, s6
@@ -1431,11 +1431,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_signed_v7f16_v7i32(<7 x half> %f) {
 ; CHECK-NEXT:    vcvt.s32.f16 s12, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s4, s2
-; CHECK-NEXT:    vmovx.f16 s2, s2
-; CHECK-NEXT:    vcvt.s32.f16 s8, s8
+; CHECK-NEXT:    vcvt.s32.f16 s2, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s10, s10
+; CHECK-NEXT:    vcvt.s32.f16 s8, s8
 ; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvt.s32.f16 s2, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s6, s3
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
@@ -1465,9 +1465,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32(<8 x half> %f) {
 ; CHECK-NEXT:    vcvt.s32.f16 s14, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s2, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
-; CHECK-NEXT:    vcvt.s32.f16 s4, s4
-; CHECK-NEXT:    vcvt.s32.f16 s6, s6
 ; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vcvt.s32.f16 s4, s4
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s12, s3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
@@ -4823,9 +4823,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32_duplicate(<8 x half> %
 ; CHECK-NEXT:    vcvt.s32.f16 s14, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s2, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
-; CHECK-NEXT:    vcvt.s32.f16 s4, s4
-; CHECK-NEXT:    vcvt.s32.f16 s6, s6
 ; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vcvt.s32.f16 s4, s4
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s12, s3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
@@ -5233,14 +5233,13 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r6, #0
 ; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcvtt.f32.f16 s19, s17
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    vmov r1, s19
 ; CHECK-NEXT:    vcmp.f32 s24, s30
+; CHECK-NEXT:    vcvtt.f32.f16 s19, s17
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s24, s28
+; CHECK-NEXT:    mov r7, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
@@ -5249,7 +5248,7 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    vmov q5[2], q5[0], r0, r6
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s19
 ; CHECK-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEXT:    vcvtb.f32.f16 s17, s17
 ; CHECK-NEXT:    mov r6, r0
@@ -5293,14 +5292,13 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-NEXT:    movvs r7, #0
 ; CHECK-NEXT:    vmov q5[3], q5[1], r7, r5
 ; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s18
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    vmov r1, s16
 ; CHECK-NEXT:    vcmp.f32 s17, s30
+; CHECK-NEXT:    vcvtt.f32.f16 s16, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s28
+; CHECK-NEXT:    mov r7, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
@@ -5309,7 +5307,7 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    vmov q6[2], q6[0], r0, r6
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEXT:    vcvtb.f32.f16 s18, s18
 ; CHECK-NEXT:    mov r6, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 5ab184a066e497..1849341ce72b7b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -1145,12 +1145,12 @@ define arm_aapcs_vfpcc <5 x i32> @test_unsigned_v5f16_v5i32(<5 x half> %f) {
 ; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s8, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
-; CHECK-NEXT:    vcvt.u32.f16 s4, s4
-; CHECK-NEXT:    vcvt.u32.f16 s6, s6
 ; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vcvt.u32.f16 s2, s2
+; CHECK-NEXT:    vcvt.u32.f16 s6, s6
 ; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vcvt.u32.f16 s4, s4
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vcvt.u32.f16 s2, s2
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
@@ -1170,11 +1170,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-NEXT:    vcvt.u32.f16 s10, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s4, s2
-; CHECK-NEXT:    vmovx.f16 s2, s2
-; CHECK-NEXT:    vcvt.u32.f16 s6, s6
+; CHECK-NEXT:    vcvt.u32.f16 s2, s2
 ; CHECK-NEXT:    vcvt.u32.f16 s8, s8
+; CHECK-NEXT:    vcvt.u32.f16 s6, s6
 ; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vcvt.u32.f16 s2, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
 ; CHECK-NEXT:    vmov r1, s6
@@ -1197,11 +1197,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_unsigned_v7f16_v7i32(<7 x half> %f) {
 ; CHECK-NEXT:    vcvt.u32.f16 s12, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s4, s2
-; CHECK-NEXT:    vmovx.f16 s2, s2
-; CHECK-NEXT:    vcvt.u32.f16 s8, s8
+; CHECK-NEXT:    vcvt.u32.f16 s2, s2
 ; CHECK-NEXT:    vcvt.u32.f16 s10, s10
+; CHECK-NEXT:    vcvt.u32.f16 s8, s8
 ; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvt.u32.f16 s2, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s6, s3
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
@@ -1231,9 +1231,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_unsigned_v8f16_v8i32(<8 x half> %f) {
 ; CHECK-NEXT:    vcvt.u32.f16 s14, s2
 ; CHECK-NEXT:    vcvt.u32.f16 s2, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
-; CHECK-NEXT:    vcvt.u32.f16 s4, s4
-; CHECK-NEXT:    vcvt.u32.f16 s6, s6
 ; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vcvt.u32.f16 s6, s6
+; CHECK-NEXT:    vcvt.u32.f16 s4, s4
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s12, s3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
@@ -1769,8 +1769,8 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    str.w r0, [r8, #25]
-; CHECK-NEXT:    vmov r7, s17
 ; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    vmov r7, s17
 ; CHECK-NEXT:    mov r0, r3
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s16, #0
@@ -3735,9 +3735,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_unsigned_v8f16_v8i32_duplicate(<8 x half>
 ; CHECK-NEXT:    vcvt.u32.f16 s14, s2
 ; CHECK-NEXT:    vcvt.u32.f16 s2, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
-; CHECK-NEXT:    vcvt.u32.f16 s4, s4
-; CHECK-NEXT:    vcvt.u32.f16 s6, s6
 ; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vcvt.u32.f16 s6, s6
+; CHECK-NEXT:    vcvt.u32.f16 s4, s4
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s12, s3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
@@ -4069,19 +4069,18 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r6, #-1
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcvtt.f32.f16 s30, s17
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    vmov r1, s30
 ; CHECK-NEXT:    vcmp.f32 s16, #0
+; CHECK-NEXT:    vcvtt.f32.f16 s30, s17
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s28
+; CHECK-NEXT:    mov r7, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    vmov q5[2], q5[0], r0, r6
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s30
 ; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcmp.f32 s30, #0
 ; CHECK-NEXT:    mov r6, r0
@@ -4113,19 +4112,18 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmov q5[3], q5[1], r7, r5
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcvtt.f32.f16 s17, s18
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    vmov r1, s17
 ; CHECK-NEXT:    vcmp.f32 s16, #0
+; CHECK-NEXT:    vcvtt.f32.f16 s17, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s28
+; CHECK-NEXT:    mov r7, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    vmov q6[2], q6[0], r0, r6
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov r0, s17
 ; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcmp.f32 s17, #0
 ; CHECK-NEXT:    mov r6, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-frint.ll b/llvm/test/CodeGen/Thumb2/mve-frint.ll
index 1d7dcc8bf84407..b836a4b7014c71 100644
--- a/llvm/test/CodeGen/Thumb2/mve-frint.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-frint.ll
@@ -60,10 +60,8 @@ define arm_aapcs_vfpcc <2 x double> @fceil_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl ceil
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl ceil
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -132,10 +130,8 @@ define arm_aapcs_vfpcc <2 x double> @ftrunc_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl trunc
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl trunc
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -204,10 +200,8 @@ define arm_aapcs_vfpcc <2 x double> @frint_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl rint
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl rint
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -266,10 +260,8 @@ define arm_aapcs_vfpcc <2 x double> @fnearbyint_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl nearbyint
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl nearbyint
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -338,10 +330,8 @@ define arm_aapcs_vfpcc <2 x double> @ffloor_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl floor
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl floor
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -410,10 +400,8 @@ define arm_aapcs_vfpcc <2 x double> @fround_float64_t(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl round
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl round
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index fe28f785623ed5..e67b2fe32b7e2a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -65,13 +65,13 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-LABEL: ext_add_trunc_i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s2, s3
 ; CHECK-NEXT:    vmov.f32 s6, s7
 ; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.f32 s8, s2
-; CHECK-NEXT:    vmov.f32 s2, s3
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.f32 s8, s3
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    add.w r12, r1, r0
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vmov r0, s0
diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
index 05f438acc3a7ec..e00908dd037bb1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
@@ -64,22 +64,19 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(ptr nocapture readonly %pSrc, i32
 ; CHECK-NEXT:    bl __aeabi_l2d
 ; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    asrs r1, r2, #31
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_l2d
 ; CHECK-NEXT:    vmov.f32 s2, s21
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    asrs r1, r2, #31
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_l2d
 ; CHECK-NEXT:    vmov r2, s20
 ; CHECK-NEXT:    vmov d11, r0, r1
-; CHECK-NEXT:    asrs r3, r2, #31
+; CHECK-NEXT:    asrs r1, r2, #31
 ; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_l2d
 ; CHECK-NEXT:    vmov d10, r0, r1
 ; CHECK-NEXT:    vmov q1, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index f4643f8c6c4a1f..aaee97318ecd0d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -7,10 +7,10 @@
 define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) {
 ; CHECK-LABEL: shuffle1_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -30,10 +30,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @shuffle3_i32(<4 x i32> %src) {
 ; CHECK-LABEL: shuffle3_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s1
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -923,10 +923,10 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shuffle2_i64(<2 x i64> %src) {
 ; CHECK-LABEL: shuffle2_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s6, s0
-; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -948,10 +948,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @shuffle1_f32(<4 x float> %src) {
 ; CHECK-LABEL: shuffle1_f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -971,10 +971,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @shuffle3_f32(<4 x float> %src) {
 ; CHECK-LABEL: shuffle3_f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s1
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1383,10 +1383,10 @@ entry:
 define arm_aapcs_vfpcc <2 x double> @shuffle2_f64(<2 x double> %src) {
 ; CHECK-LABEL: shuffle2_f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s6, s0
-; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
index 6ce75500142964..de9328a3c24234 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
@@ -7,10 +7,10 @@
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_45670123(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_45670123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s6, s0
-; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -21,10 +21,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_67452301(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_67452301:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -69,10 +69,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_u7u5u3u1(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_u7u5u3u1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -83,10 +83,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_6u4u2u0u(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_6u4u2u0u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -120,10 +120,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45670123(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cdef89ab45670123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -213,10 +213,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdeu89ub4u67u123(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cdeu89ub4u67u123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -227,10 +227,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cduu8uubuu67u12u(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cduu8uubuu67u12u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -241,10 +241,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cuuuuuubuu6uuu2u(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cuuuuuubuu6uuu2u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -261,9 +261,9 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45u700123(<16 x i8> %s1, <1
 ; CHECK-NEXT:    vmov.8 q1[9], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
 ; CHECK-NEXT:    vmov.8 q1[11], r0
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s5, s2
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -278,10 +278,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_45670123(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_45670123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s6, s0
-; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -292,10 +292,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_67452301(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_67452301:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -340,10 +340,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_u7u5u3u1(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_u7u5u3u1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -354,10 +354,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_6u4u2u0u(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_6u4u2u0u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s6, s1
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
index b3f7b7d961ad02..bf921d2602f243 100644
--- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
@@ -117,11 +117,9 @@ define arm_aapcs_vfpcc <2 x double> @add_float64_t(<2 x double> %src1, <2 x doub
 ; CHECK-NEXT:    vmov r0, r1, d11
 ; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_dadd
-; CHECK-NEXT:    vmov lr, r12, d10
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, lr
-; CHECK-NEXT:    mov r1, r12
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl __aeabi_dadd
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -248,11 +246,9 @@ define arm_aapcs_vfpcc <2 x double> @sub_float64_t(<2 x double> %src1, <2 x doub
 ; CHECK-NEXT:    vmov r0, r1, d11
 ; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_dsub
-; CHECK-NEXT:    vmov lr, r12, d10
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, lr
-; CHECK-NEXT:    mov r1, r12
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl __aeabi_dsub
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -381,11 +377,9 @@ define arm_aapcs_vfpcc <2 x double> @mul_float64_t(<2 x double> %src1, <2 x doub
 ; CHECK-NEXT:    vmov r0, r1, d11
 ; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    bl __aeabi_dmul
-; CHECK-NEXT:    vmov lr, r12, d10
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, lr
-; CHECK-NEXT:    mov r1, r12
+; CHECK-NEXT:    vmov r0, r1, d10
+; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    bl __aeabi_dmul
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index 1279714b5a78ca..04dcc77e9f937b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -374,17 +374,17 @@ define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y,
 ; CHECK-NEXT:  .LBB17_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
+; CHECK-NEXT:    vmov.f32 s12, s10
 ; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov r7, s4
 ; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    vmov r4, s12
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
-; CHECK-NEXT:    vmov.f32 s12, s10
 ; CHECK-NEXT:    vmov.f32 s10, s5
 ; CHECK-NEXT:    vmov.f32 s14, s11
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    asr.w r12, r3, #31
 ; CHECK-NEXT:    subs.w r8, r3, r4
+; CHECK-NEXT:    asr.w r12, r3, #31
 ; CHECK-NEXT:    sbc.w r12, r12, r4, asr #31
 ; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    vmov.f32 s10, s9
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
index ad7a09fa50acba..368884802a3152 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
@@ -266,10 +266,8 @@ define arm_aapcs_vfpcc <2 x double> @foo_float_int64(<2 x i64> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __aeabi_l2d
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __aeabi_l2d
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -290,10 +288,8 @@ define arm_aapcs_vfpcc <2 x double> @foo_float_uint64(<2 x i64> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __aeabi_ul2d
-; CHECK-NEXT:    vmov r2, r3, d8
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __aeabi_ul2d
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
index a5725a2a300483..82db1a95037a9d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
@@ -18,10 +18,10 @@ entry:
 define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
 ; CHECK-LABEL: fpext_8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtt.f32.f16 s11, s1
-; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
 ; CHECK-NEXT:    vcvtt.f32.f16 s9, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
 ; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
+; CHECK-NEXT:    vcvtt.f32.f16 s11, s1
 ; CHECK-NEXT:    vcvtt.f32.f16 s7, s3
 ; CHECK-NEXT:    vcvtb.f32.f16 s6, s3
 ; CHECK-NEXT:    vcvtt.f32.f16 s5, s2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index b49f19e55c895a..271beac1392880 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -10,17 +10,15 @@ define void @vld4_v2i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vmov.f32 s10, s7
 ; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.f32 s6, s5
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov.f32 s8, s3
 ; CHECK-NEXT:    vmov.f32 s12, s1
 ; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    add r0, r2
-; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    add r2, r3
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    add.w r12, r2, r0
 ; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov r3, s12
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
index b005cb92dc5169..8e6e0191e26703 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -330,8 +330,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmovn32_t2(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_t2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s5, s0
 ; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vmov.f32 s5, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 ;
@@ -420,8 +420,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmovn32_b4(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_b4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s5, s1
 ; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s5, s1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 ;
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index b36904495e878d..176246427e64ce 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -131,8 +131,8 @@ define void @vst4_v16i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vmov q7, q5
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
@@ -897,8 +897,8 @@ define void @vst4_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vmov q7, q5
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
index 0d7354fcbbf2c5..bad1bf15c145f3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
@@ -62,23 +62,19 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(ptr nocapture readonly %pSrc, i32
 ; CHECK-NEXT:    vand q6, q0, q5
 ; CHECK-NEXT:    vmov r0, r1, d13
 ; CHECK-NEXT:    bl __aeabi_ul2d
-; CHECK-NEXT:    vmov r2, r3, d12
 ; CHECK-NEXT:    vmov.f32 s0, s18
 ; CHECK-NEXT:    vmov.f32 s2, s19
 ; CHECK-NEXT:    vmov d9, r0, r1
-; CHECK-NEXT:    vand q5, q0, q5
+; CHECK-NEXT:    vmov r0, r1, d12
 ; CHECK-NEXT:    vmov r4, r5, d11
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vand q5, q0, q5
 ; CHECK-NEXT:    bl __aeabi_ul2d
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl __aeabi_ul2d
-; CHECK-NEXT:    vmov r2, r3, d10
 ; CHECK-NEXT:    vmov d11, r0, r1
-; CHECK-NEXT:    mov r0, r2
-; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    vmov r0, r1, d10
 ; CHECK-NEXT:    bl __aeabi_ul2d
 ; CHECK-NEXT:    vmov d10, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
index a4d15a1b21d6b4..f9d6663b5b8a30 100644
--- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
@@ -18,11 +18,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq 8(%rdi), %r18
 ; EGPR-NEXT:    movq 24(%rdi), %r29
 ; EGPR-NEXT:    movq 16(%rdi), %r17
+; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq 32(%rdi), %r10
+; EGPR-NEXT:    movq 56(%rdi), %r15
 ; EGPR-NEXT:    movq 40(%rdi), %rdi
-; EGPR-NEXT:    movq 32(%r24), %r10
-; EGPR-NEXT:    movq 56(%r24), %r15
 ; EGPR-NEXT:    movq 48(%r24), %r12
-; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    movq 24(%rsi), %r23
 ; EGPR-NEXT:    movq 16(%rsi), %r11
 ; EGPR-NEXT:    movq (%rsi), %r27
@@ -1295,15 +1295,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-NEXT:    adcq %rcx, %r29, %r8
 ; EGPR-NDD-NEXT:    adcq $0, %rdi
 ; EGPR-NDD-NEXT:    adcq $0, %rsi, %r9
+; EGPR-NDD-NEXT:    movq %r11, %r14
+; EGPR-NDD-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq 48(%r11), %r11
+; EGPR-NDD-NEXT:    movq %r10, %rax
+; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    movq 48(%r15), %r11
 ; EGPR-NDD-NEXT:    movq %r17, %rsi
 ; EGPR-NDD-NEXT:    movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    movq %r17, %rax
 ; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    movq %rdx, %r28
 ; EGPR-NDD-NEXT:    movq %rax, %r29
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq %r10, %rsi
+; EGPR-NDD-NEXT:    movq %rdx, %r28
+; EGPR-NDD-NEXT:    movq %r16, %rax
+; EGPR-NDD-NEXT:    movq %r16, %r10
+; EGPR-NDD-NEXT:    movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    mulq %r11
 ; EGPR-NDD-NEXT:    addq %rax, %r28
 ; EGPR-NDD-NEXT:    adcq $0, %rdx, %rcx
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a3..ab48bc292a40cc 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -332,11 +332,11 @@ define void @store_i256(ptr %ptr, i256 %v) {
 ; CHECK-O3:       # %bb.0:
 ; CHECK-O3-NEXT:    subq $40, %rsp
 ; CHECK-O3-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-O3-NEXT:    movq %rsi, (%rsp)
 ; CHECK-O3-NEXT:    movq %rdi, %rax
 ; CHECK-O3-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
 ; CHECK-O3-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; CHECK-O3-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; CHECK-O3-NEXT:    movq %rsi, (%rsp)
 ; CHECK-O3-NEXT:    movq %rsp, %rdx
 ; CHECK-O3-NEXT:    movl $32, %edi
 ; CHECK-O3-NEXT:    movq %rax, %rsi
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index 07e86cb01e133c..4937bcd110b781 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -392,10 +392,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vmpsadbw $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xd9,0x02]
 ; X86-NEXT:    vmpsadbw $3, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x49,0x42,0xe1,0x03]
 ; X86-NEXT:    vmpsadbw $4, %zmm1, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0xc9,0x42,0xd1,0x04]
-; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X86-NEXT:    vmpsadbw $2, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xc1,0x02]
 ; X86-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -403,10 +402,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8>
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vmpsadbw $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xd9,0x02]
 ; X64-NEXT:    vmpsadbw $3, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x49,0x42,0xe1,0x03]
 ; X64-NEXT:    vmpsadbw $4, %zmm1, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0xc9,0x42,0xd1,0x04]
-; X64-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X64-NEXT:    vmpsadbw $2, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xc1,0x02]
 ; X64-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %msk = bitcast i32 %x4 to <32 x i1>
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index 31cec891c4cf38..1a9eeaeefad966 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -572,10 +572,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0,
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vmpsadbw $2, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xd9,0x02]
 ; X86-NEXT:    vmpsadbw $3, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x09,0x42,0xe1,0x03]
 ; X86-NEXT:    vmpsadbw $4, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0x89,0x42,0xd1,0x04]
-; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT:    vmpsadbw $2, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x02]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -583,10 +582,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0,
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vmpsadbw $2, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xd9,0x02]
 ; X64-NEXT:    vmpsadbw $3, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x09,0x42,0xe1,0x03]
 ; X64-NEXT:    vmpsadbw $4, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0x89,0x42,0xd1,0x04]
-; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64-NEXT:    vmpsadbw $2, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x02]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %msk = bitcast i8 %x4 to <8 x i1>
@@ -606,10 +604,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_mask_mpsadbw_256(<32 x i8> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vmpsadbw $2, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xd9,0x02]
 ; X86-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x29,0x42,0xe1,0x03]
 ; X86-NEXT:    vmpsadbw $4, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0xa9,0x42,0xd1,0x04]
-; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86-NEXT:    vmpsadbw $2, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x02]
 ; X86-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -617,10 +614,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_mask_mpsadbw_256(<32 x i8> %
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vmpsadbw $2, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xd9,0x02]
 ; X64-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x29,0x42,0xe1,0x03]
 ; X64-NEXT:    vmpsadbw $4, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0xa9,0x42,0xd1,0x04]
-; X64-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X64-NEXT:    vmpsadbw $2, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x02]
 ; X64-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %msk = bitcast i16 %x4 to <16 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index b39b089faa2a5e..5d97a56ce12c6b 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -937,17 +937,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kmovw %k1, %r12d
 ; KNL-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r13d
-; KNL-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL-NEXT:    andl $1, %edx
 ; KNL-NEXT:    movb %dl, 2(%rax)
 ; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    andl $1, %edx
 ; KNL-NEXT:    andl $1, %r9d
+; KNL-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL-NEXT:    leal (%rdx,%r9,2), %r9d
 ; KNL-NEXT:    kmovw %k1, %edx
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    andl $1, %r8d
 ; KNL-NEXT:    leal (%r9,%r8,4), %r9d
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %r8d
 ; KNL-NEXT:    andl $1, %esi
 ; KNL-NEXT:    leal (%r9,%rsi,8), %esi
@@ -1250,17 +1250,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kmovd %k1, %r12d
 ; SKX-NEXT:    kshiftrd $13, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r13d
-; SKX-NEXT:    kshiftrd $14, %k0, %k1
 ; SKX-NEXT:    andl $1, %edx
 ; SKX-NEXT:    movb %dl, 2(%rax)
 ; SKX-NEXT:    kmovd %k0, %edx
 ; SKX-NEXT:    andl $1, %edx
 ; SKX-NEXT:    andl $1, %r9d
+; SKX-NEXT:    kshiftrd $14, %k0, %k1
 ; SKX-NEXT:    leal (%rdx,%r9,2), %r9d
 ; SKX-NEXT:    kmovd %k1, %edx
-; SKX-NEXT:    kshiftrd $15, %k0, %k0
 ; SKX-NEXT:    andl $1, %r8d
 ; SKX-NEXT:    leal (%r9,%r8,4), %r9d
+; SKX-NEXT:    kshiftrd $15, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %r8d
 ; SKX-NEXT:    andl $1, %esi
 ; SKX-NEXT:    leal (%r9,%rsi,8), %esi
@@ -1563,56 +1563,56 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kmovw %k1, %edx
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ecx
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebx
 ; KNL_X32-NEXT:    movb %bl, 2(%eax)
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebp
+; KNL_X32-NEXT:    kshiftrw $6, %k0, %k1
 ; KNL_X32-NEXT:    leal (%ebx,%ebp,2), %ebx
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %esi
 ; KNL_X32-NEXT:    leal (%ebx,%esi,4), %ebx
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %esi
-; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edi
 ; KNL_X32-NEXT:    leal (%ebx,%edi,8), %ebx
+; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edi
-; KNL_X32-NEXT:    kshiftrw $9, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    shll $4, %edx
 ; KNL_X32-NEXT:    orl %ebx, %edx
+; KNL_X32-NEXT:    kshiftrw $9, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ebx
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ecx
 ; KNL_X32-NEXT:    shll $5, %ecx
 ; KNL_X32-NEXT:    orl %edx, %ecx
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edx
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebp
 ; KNL_X32-NEXT:    shll $6, %ebp
 ; KNL_X32-NEXT:    andl $1, %esi
 ; KNL_X32-NEXT:    shll $7, %esi
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k1
 ; KNL_X32-NEXT:    orl %ebp, %esi
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edi
 ; KNL_X32-NEXT:    shll $8, %edi
 ; KNL_X32-NEXT:    orl %esi, %edi
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %esi
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebx
 ; KNL_X32-NEXT:    shll $9, %ebx
 ; KNL_X32-NEXT:    orl %edi, %ebx
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edi
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    shll $10, %edx
 ; KNL_X32-NEXT:    orl %ebx, %edx
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ebx
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL_X32-NEXT:    orl %ecx, %edx
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, %ecx
 ; KNL_X32-NEXT:    andl $1, %ebp
 ; KNL_X32-NEXT:    shll $11, %ebp
@@ -1891,17 +1891,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kmovd %k1, %r12d
 ; FASTISEL-NEXT:    kshiftrd $13, %k0, %k1
 ; FASTISEL-NEXT:    kmovd %k1, %r13d
-; FASTISEL-NEXT:    kshiftrd $14, %k0, %k1
 ; FASTISEL-NEXT:    andl $1, %edx
 ; FASTISEL-NEXT:    movb %dl, 2(%rax)
 ; FASTISEL-NEXT:    kmovd %k0, %edx
 ; FASTISEL-NEXT:    andl $1, %edx
 ; FASTISEL-NEXT:    andl $1, %r9d
+; FASTISEL-NEXT:    kshiftrd $14, %k0, %k1
 ; FASTISEL-NEXT:    leal (%rdx,%r9,2), %r9d
 ; FASTISEL-NEXT:    kmovd %k1, %edx
-; FASTISEL-NEXT:    kshiftrd $15, %k0, %k0
 ; FASTISEL-NEXT:    andl $1, %r8d
 ; FASTISEL-NEXT:    leal (%r9,%r8,4), %r9d
+; FASTISEL-NEXT:    kshiftrd $15, %k0, %k0
 ; FASTISEL-NEXT:    kmovd %k0, %r8d
 ; FASTISEL-NEXT:    andl $1, %esi
 ; FASTISEL-NEXT:    leal (%r9,%rsi,8), %esi
@@ -3113,22 +3113,22 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL_X32-NEXT:    kmovw %k1, %eax
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edx
-; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andb $1, %bl
 ; KNL_X32-NEXT:    andb $1, %dl
 ; KNL_X32-NEXT:    addb %dl, %dl
+; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
 ; KNL_X32-NEXT:    orb %bl, %dl
 ; KNL_X32-NEXT:    kmovw %k1, %ebx
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
 ; KNL_X32-NEXT:    andb $1, %bl
 ; KNL_X32-NEXT:    shlb $2, %bl
 ; KNL_X32-NEXT:    orb %dl, %bl
+; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edx
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k0
 ; KNL_X32-NEXT:    andb $1, %dl
 ; KNL_X32-NEXT:    shlb $3, %dl
 ; KNL_X32-NEXT:    orb %bl, %dl
+; KNL_X32-NEXT:    kshiftrw $4, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andb $1, %bl
 ; KNL_X32-NEXT:    shlb $4, %bl
diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
index bafa33ff9a1c8a..2cace3060def4b 100644
--- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
@@ -9,21 +9,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8>
 ; X86BW-LABEL: test_vgf2p8affineinvqb_128:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
+; X86BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
+; X86BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xc9,0x04]
 ; X86BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_128:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
 ; X64BW-NEXT:    vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
+; X64BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
+; X64BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xc9,0x04]
 ; X64BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X64BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_128:
@@ -70,21 +68,19 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8>
 ; X86BW-LABEL: test_vgf2p8affineinvqb_256:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
+; X86BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
+; X86BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xc9,0x04]
 ; X86BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_256:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
 ; X64BW-NEXT:    vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
+; X64BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
+; X64BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xc9,0x04]
 ; X64BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X64BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_256:
@@ -138,21 +134,19 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8>
 ; X86BW-LABEL: test_vgf2p8affineinvqb_512:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
+; X86BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
+; X86BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xc9,0x04]
 ; X86BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_512:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
-; X64BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
 ; X64BW-NEXT:    vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
+; X64BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
+; X64BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xc9,0x04]
 ; X64BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X64BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_512:
@@ -226,21 +220,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s
 ; X86BW-LABEL: test_vgf2p8affineqb_128:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
+; X86BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
+; X86BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xc9,0x04]
 ; X86BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_128:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
 ; X64BW-NEXT:    vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
+; X64BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
+; X64BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xc9,0x04]
 ; X64BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X64BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_128:
@@ -287,21 +279,19 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s
 ; X86BW-LABEL: test_vgf2p8affineqb_256:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
+; X86BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
+; X86BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xc9,0x04]
 ; X86BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_256:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
 ; X64BW-NEXT:    vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
+; X64BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
+; X64BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xc9,0x04]
 ; X64BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X64BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_256:
@@ -355,21 +345,19 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s
 ; X86BW-LABEL: test_vgf2p8affineqb_512:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
+; X86BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
+; X86BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xc9,0x04]
 ; X86BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_512:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
-; X64BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
 ; X64BW-NEXT:    vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
+; X64BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
+; X64BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xc9,0x04]
 ; X64BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X64BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_512:
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 2a77d0238721c0..113828ae54ccd2 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -313,10 +313,10 @@ define i16 @test15(ptr%addr) nounwind {
 define i16 @test16(ptr%addr, i16 %a) nounwind {
 ; KNL-LABEL: test16:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movzbl (%rdi), %eax
 ; KNL-NEXT:    kmovw %esi, %k0
 ; KNL-NEXT:    movw $-1025, %cx ## imm = 0xFBFF
 ; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    movzbl (%rdi), %eax
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -349,10 +349,10 @@ define i16 @test16(ptr%addr, i16 %a) nounwind {
 define i8 @test17(ptr%addr, i8 %a) nounwind {
 ; KNL-LABEL: test17:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movzbl (%rdi), %eax
 ; KNL-NEXT:    kmovw %esi, %k0
 ; KNL-NEXT:    movw $-17, %cx
 ; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    movzbl (%rdi), %eax
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -843,12 +843,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; KNL-LABEL: test_insertelement_v32i1:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    movw $-17, %dx
 ; KNL-NEXT:    kmovw %edx, %k1
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -862,12 +862,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; SKX-LABEL: test_insertelement_v32i1:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    cmpl %esi, %edi
-; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
 ; SKX-NEXT:    vpcmpltud %zmm3, %zmm1, %k1
 ; SKX-NEXT:    kunpckwd %k0, %k1, %k0
 ; SKX-NEXT:    movl $-17, %ecx
 ; SKX-NEXT:    kmovd %ecx, %k1
+; SKX-NEXT:    setb %al
 ; SKX-NEXT:    kandd %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
@@ -889,10 +889,10 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    setb %al
 ; KNL-NEXT:    movw $-5, %cx
 ; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $13, %k1, %k1
@@ -905,10 +905,10 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; SKX-LABEL: test_iinsertelement_v4i1:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    cmpl %esi, %edi
-; SKX-NEXT:    setb %al
 ; SKX-NEXT:    movb $-5, %cl
 ; SKX-NEXT:    kmovd %ecx, %k1
 ; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; SKX-NEXT:    setb %al
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $5, %k1, %k1
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index b77c753107a6e1..eda085b0a5c4a2 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -1013,9 +1013,8 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
 ; X64-NEXT:    vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
-; X64-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X64-NEXT:    vcvtps2ph $2, %zmm0, (%rsi)
-; X64-NEXT:    vmovdqa %ymm1, %ymm0
+; X64-NEXT:    vpaddw %ymm1, %ymm2, %ymm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_x86_vcvtps2ph_256:
@@ -1024,9 +1023,8 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
 ; X86-NEXT:    vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
-; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vcvtps2ph $2, %zmm0, (%eax)
-; X86-NEXT:    vmovdqa %ymm1, %ymm0
+; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm0
 ; X86-NEXT:    retl
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
   %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask)
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 9e689341f7b88e..373bc00d004bd6 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1266,10 +1266,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    cmpl %edx, %esi
-; KNL-NEXT:    setg %al
 ; KNL-NEXT:    movw $-33, %cx
 ; KNL-NEXT:    kmovw %ecx, %k4
 ; KNL-NEXT:    kandw %k4, %k0, %k0
+; KNL-NEXT:    setg %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL-NEXT:    kshiftrw $10, %k4, %k4
@@ -1291,10 +1291,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovq %rdi, %k0
 ; SKX-NEXT:    cmpl %edx, %esi
-; SKX-NEXT:    setg %al
 ; SKX-NEXT:    movq $-33, %rcx
 ; SKX-NEXT:    kmovq %rcx, %k1
 ; SKX-NEXT:    kandq %k1, %k0, %k0
+; SKX-NEXT:    setg %al
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftlq $63, %k1, %k1
 ; SKX-NEXT:    kshiftrq $58, %k1, %k1
@@ -1306,10 +1306,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    kmovq %rdi, %k0
 ; AVX512BW-NEXT:    cmpl %edx, %esi
-; AVX512BW-NEXT:    setg %al
 ; AVX512BW-NEXT:    movq $-33, %rcx
 ; AVX512BW-NEXT:    kmovq %rcx, %k1
 ; AVX512BW-NEXT:    kandq %k1, %k0, %k0
+; AVX512BW-NEXT:    setg %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512BW-NEXT:    kshiftrq $58, %k1, %k1
@@ -1329,10 +1329,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; AVX512DQ-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-NEXT:    kmovw %edi, %k3
 ; AVX512DQ-NEXT:    cmpl %edx, %esi
-; AVX512DQ-NEXT:    setg %al
 ; AVX512DQ-NEXT:    movw $-33, %cx
 ; AVX512DQ-NEXT:    kmovw %ecx, %k4
 ; AVX512DQ-NEXT:    kandw %k4, %k1, %k1
+; AVX512DQ-NEXT:    setg %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k4
 ; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
 ; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k4
@@ -1355,11 +1355,11 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setg %al
 ; X86-NEXT:    kshiftrq $6, %k0, %k1
 ; X86-NEXT:    kshiftlq $6, %k1, %k1
 ; X86-NEXT:    kshiftlq $59, %k0, %k0
 ; X86-NEXT:    kshiftrq $59, %k0, %k0
+; X86-NEXT:    setg %al
 ; X86-NEXT:    korq %k1, %k0, %k0
 ; X86-NEXT:    kmovd %eax, %k1
 ; X86-NEXT:    kshiftlq $63, %k1, %k1
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 51ffeca52a6652..9327ee800af19b 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -215,11 +215,10 @@ declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
 define { <8 x i64>, <8 x i64> } @test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) nounwind {
 ; CHECK-LABEL: test_int_x86_avx512_psll_dq_512:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpslldq $8, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x73,0xf8,0x08]
-; CHECK-NEXT:    # zmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
 ; CHECK-NEXT:    vpslldq $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xf8,0x04]
 ; CHECK-NEXT:    # zmm1 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
-; CHECK-NEXT:    vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
+; CHECK-NEXT:    vpslldq $8, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0xf8,0x08]
+; CHECK-NEXT:    # zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
   %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
@@ -251,11 +250,10 @@ declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
 define { <8 x i64>, <8 x i64> } @test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) nounwind {
 ; CHECK-LABEL: test_int_x86_avx512_psrl_dq_512:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsrldq $8, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x73,0xd8,0x08]
-; CHECK-NEXT:    # zmm2 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vpsrldq $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xd8,0x04]
 ; CHECK-NEXT:    # zmm1 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
-; CHECK-NEXT:    vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
+; CHECK-NEXT:    vpsrldq $8, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0xd8,0x08]
+; CHECK-NEXT:    # zmm0 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
   %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
@@ -1961,9 +1959,9 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin
 ; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
 ; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
-; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
 ; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
 ; X86-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
@@ -2136,9 +2134,9 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
 ; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
 ; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
-; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
 ; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
 ; X86-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
@@ -3017,10 +3015,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X86-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
-; X86-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
 ; X86-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X86-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xc9,0x03]
 ; X86-NEXT:    vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
-; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -3028,10 +3025,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X64-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
-; X64-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
 ; X64-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X64-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xc9,0x03]
 ; X64-NEXT:    vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
-; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 3, <32 x i16> zeroinitializer, i32 %x4)
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
index 41e2aa003ce7a0..54f51be57536f0 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1244,10 +1244,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X86-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
-; X86-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
 ; X86-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X86-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xc9,0x03]
 ; X86-NEXT:    vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
-; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -1255,10 +1254,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X64-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
-; X64-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
 ; X64-NEXT:    vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X64-NEXT:    vdbpsadbw $3, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xc9,0x03]
 ; X64-NEXT:    vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
-; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2)
   %2 = bitcast i32 %x4 to <32 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index ae710cc40a522f..1cc481174c0030 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -6934,10 +6934,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
-; X86-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
 ; X86-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X86-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc9,0x03]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
@@ -6945,10 +6944,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
-; X64-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
 ; X64-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X64-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc9,0x03]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 3, <8 x i16> zeroinitializer, i8 %x4)
@@ -6967,10 +6965,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
-; X86-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
 ; X86-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X86-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc9,0x03]
 ; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
@@ -6978,10 +6975,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
-; X64-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
 ; X64-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X64-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc9,0x03]
 ; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 3, <16 x i16> zeroinitializer, i16 %x4)
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index f76b96eda75404..e55e469391542e 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1517,19 +1517,17 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmov_wb_128
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpmovwb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc3]
 ; X86-NEXT:    vpmovwb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
 ; X86-NEXT:    vpmovwb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
-; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT:    vpmovwb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpmovwb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc3]
 ; X64-NEXT:    vpmovwb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
 ; X64-NEXT:    vpmovwb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
-; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64-NEXT:    vpmovwb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
@@ -1570,19 +1568,17 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmovs_wb_12
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpmovswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc3]
 ; X86-NEXT:    vpmovswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
 ; X86-NEXT:    vpmovswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
-; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT:    vpmovswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpmovswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc3]
 ; X64-NEXT:    vpmovswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
 ; X64-NEXT:    vpmovswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
-; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64-NEXT:    vpmovswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
@@ -1623,19 +1619,17 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmovus_wb_1
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpmovuswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc3]
 ; X86-NEXT:    vpmovuswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
 ; X86-NEXT:    vpmovuswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
-; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT:    vpmovuswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpmovuswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc3]
 ; X64-NEXT:    vpmovuswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
 ; X64-NEXT:    vpmovuswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
-; X64-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64-NEXT:    vpmovuswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
@@ -1994,10 +1988,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
-; X86-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
 ; X86-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X86-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc9,0x03]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
@@ -2005,10 +1998,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
-; X64-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
 ; X64-NEXT:    vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X64-NEXT:    vdbpsadbw $3, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc9,0x03]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <8 x i16> @llvm.x86.avx512.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2)
   %2 = bitcast i8 %x4 to <8 x i1>
@@ -2031,10 +2023,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
-; X86-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
 ; X86-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X86-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc9,0x03]
 ; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
@@ -2042,10 +2033,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
-; X64-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
 ; X64-NEXT:    vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X64-NEXT:    vdbpsadbw $3, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc9,0x03]
 ; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <16 x i16> @llvm.x86.avx512.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2)
   %2 = bitcast i16 %x4 to <16 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
index 1bcac8ff553d1f..268dbee3857277 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
@@ -732,10 +732,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X86-NEXT:    vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
@@ -743,10 +742,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X64-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X64-NEXT:    vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X64-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 23, <4 x i32> %x3, i8 -1)
@@ -765,10 +763,9 @@ define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshld_d_25
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x17]
 ; X86-NEXT:    vpshldd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x71,0xd1,0x18]
+; X86-NEXT:    vpshldd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_256:
@@ -776,10 +773,9 @@ define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshld_d_25
 ; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xe1,0x16]
-; X64-NEXT:    vpshldd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x17]
 ; X64-NEXT:    vpshldd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x71,0xd1,0x18]
+; X64-NEXT:    vpshldd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 23, <8 x i32> %x3, i8 -1)
@@ -798,10 +794,9 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshld_q_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x17]
 ; X86-NEXT:    vpshldq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x71,0xd1,0x18]
+; X86-NEXT:    vpshldq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_128:
@@ -809,10 +804,9 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshld_q_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xe1,0x16]
-; X64-NEXT:    vpshldq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x17]
 ; X64-NEXT:    vpshldq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x71,0xd1,0x18]
+; X64-NEXT:    vpshldq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 23, <2 x i64> %x3, i8 -1)
@@ -831,10 +825,9 @@ define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshld_q_25
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x17]
 ; X86-NEXT:    vpshldq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x71,0xd1,0x18]
+; X86-NEXT:    vpshldq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_256:
@@ -842,10 +835,9 @@ define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshld_q_25
 ; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xe1,0x16]
-; X64-NEXT:    vpshldq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x17]
 ; X64-NEXT:    vpshldq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x71,0xd1,0x18]
+; X64-NEXT:    vpshldq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 23, <4 x i64> %x3, i8 -1)
@@ -864,10 +856,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshld_w_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xe1,0x06]
-; X86-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x07]
 ; X86-NEXT:    vpshldw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x70,0xd1,0x08]
+; X86-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xc9,0x07]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_128:
@@ -875,10 +866,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshld_w_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xe1,0x06]
-; X64-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x07]
 ; X64-NEXT:    vpshldw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x70,0xd1,0x08]
+; X64-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xc9,0x07]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 6, <8 x i16> %x3, i8 %x4)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 7, <8 x i16> %x3, i8 -1)
@@ -896,10 +886,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshld_w
 ; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshldw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xe1,0x06]
-; X86-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x07]
 ; X86-NEXT:    vpshldw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x70,0xd1,0x08]
+; X86-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xc9,0x07]
 ; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_256:
@@ -907,10 +896,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshld_w
 ; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xe1,0x06]
-; X64-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x07]
 ; X64-NEXT:    vpshldw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x70,0xd1,0x08]
+; X64-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xc9,0x07]
 ; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 6, <16 x i16> %x3, i16 %x4)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 7, <16 x i16> %x3, i16 -1)
@@ -929,10 +917,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X86-NEXT:    vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
@@ -940,10 +927,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X64-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X64-NEXT:    vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X64-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 23, <4 x i32> %x3, i8 -1)
@@ -962,10 +948,9 @@ define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshrd_d_25
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x17]
 ; X86-NEXT:    vpshrdd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x73,0xd1,0x18]
+; X86-NEXT:    vpshrdd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
@@ -973,10 +958,9 @@ define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshrd_d_25
 ; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xe1,0x16]
-; X64-NEXT:    vpshrdd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x17]
 ; X64-NEXT:    vpshrdd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x73,0xd1,0x18]
+; X64-NEXT:    vpshrdd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 23, <8 x i32> %x3, i8 -1)
@@ -995,10 +979,9 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshrd_q_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x17]
 ; X86-NEXT:    vpshrdq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x73,0xd1,0x18]
+; X86-NEXT:    vpshrdq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
@@ -1006,10 +989,9 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshrd_q_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xe1,0x16]
-; X64-NEXT:    vpshrdq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x17]
 ; X64-NEXT:    vpshrdq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x73,0xd1,0x18]
+; X64-NEXT:    vpshrdq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 23, <2 x i64> %x3, i8 -1)
@@ -1028,10 +1010,9 @@ define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshrd_q_25
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x17]
 ; X86-NEXT:    vpshrdq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x73,0xd1,0x18]
+; X86-NEXT:    vpshrdq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
@@ -1039,10 +1020,9 @@ define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshrd_q_25
 ; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xe1,0x16]
-; X64-NEXT:    vpshrdq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x17]
 ; X64-NEXT:    vpshrdq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x73,0xd1,0x18]
+; X64-NEXT:    vpshrdq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 23, <4 x i64> %x3, i8 -1)
@@ -1061,10 +1041,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshrd_w_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xe1,0x06]
-; X86-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x07]
 ; X86-NEXT:    vpshrdw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x72,0xd1,0x08]
+; X86-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xc9,0x07]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
@@ -1072,10 +1051,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshrd_w_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xe1,0x06]
-; X64-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x07]
 ; X64-NEXT:    vpshrdw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x72,0xd1,0x08]
+; X64-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xc9,0x07]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 6, <8 x i16> %x3, i8 %x4)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 7, <8 x i16> %x3, i8 -1)
@@ -1093,10 +1071,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshrd_w
 ; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpshrdw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xe1,0x06]
-; X86-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x07]
 ; X86-NEXT:    vpshrdw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x72,0xd1,0x08]
+; X86-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xc9,0x07]
 ; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
@@ -1104,10 +1081,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshrd_w
 ; X64-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xe1,0x06]
-; X64-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x07]
 ; X64-NEXT:    vpshrdw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x72,0xd1,0x08]
+; X64-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xc9,0x07]
 ; X64-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 6, <16 x i16> %x3, i16 %x4)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 7, <16 x i16> %x3, i16 -1)
@@ -1126,10 +1102,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X86-NEXT:    vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128_2:
@@ -1137,10 +1112,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X64-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X64-NEXT:    vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X64-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
   %2 = bitcast i8 %x4 to <8 x i1>
@@ -1301,10 +1275,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X86-NEXT:    vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128_2:
@@ -1312,10 +1285,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X64-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X64-NEXT:    vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X64-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
   %2 = bitcast i8 %x4 to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
index 9223885730d049..abcb7d3f1182a0 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
@@ -728,10 +728,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X86-NEXT:    vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
@@ -739,10 +738,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X64-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X64-NEXT:    vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X64-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> <i32 22, i32 22, i32 22, i32 22>)
   %2 = bitcast i8 %x4 to <8 x i1>
@@ -897,10 +895,9 @@ define { <4 x i32>, <4 x i32>,  <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_1
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X86-NEXT:    vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
@@ -908,10 +905,9 @@ define { <4 x i32>, <4 x i32>,  <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_1
 ; X64-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X64-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X64-NEXT:    vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X64-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X64-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> <i32 22, i32 22, i32 22, i32 22>)
   %2 = bitcast i8 %x4 to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index c0bb0037923dce..21c26f3cd78ba8 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -10720,9 +10720,9 @@ declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8)
 define i8 at test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestm_d_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10779,9 +10779,9 @@ declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8)
 define i8 at test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestm_q_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10810,9 +10810,9 @@ declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8)
 define i8 at test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestm_q_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10843,9 +10843,9 @@ declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2)
 define i8 at test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestnm_d_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10902,9 +10902,9 @@ declare i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2)
 define i8 at test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestnm_q_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10933,9 +10933,9 @@ declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2)
 define i8 at test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestnm_q_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vptestnmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vptestnmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1d3b015f3c5479..386b63d551988b 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -178,13 +178,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $156, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    xorl %eax, %esi
 ; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %ecx, %edx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 58ea70e58028f1..8a1d247543cf96 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -208,11 +208,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    bsrl %ebp, %ebp
 ; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl $31, %ebp
 ; X86-NEXT:    orl $32, %ebp
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    orl $64, %ebp
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    orl %ebx, %edx
diff --git a/llvm/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll b/llvm/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
index dfbd7278ce432a..fbd2a6752dfd5a 100644
--- a/llvm/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
@@ -96,9 +96,8 @@ define void @test_memcpy_args(ptr %Storage) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    movq (%rdi), %rax
 ; CHECK-NEXT:    movq 8(%rdi), %rsi
-; CHECK-NEXT:    movq %rax, %rdi
+; CHECK-NEXT:    movq (%rdi), %rdi
 ; CHECK-NEXT:    movl $1024, %edx # imm = 0x400
 ; CHECK-NEXT:    callq __llvm_memcpy_element_unordered_atomic_4 at PLT
 ; CHECK-NEXT:    popq %rax
@@ -210,9 +209,8 @@ define void @test_memmove_args(ptr %Storage) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    movq (%rdi), %rax
 ; CHECK-NEXT:    movq 8(%rdi), %rsi
-; CHECK-NEXT:    movq %rax, %rdi
+; CHECK-NEXT:    movq (%rdi), %rdi
 ; CHECK-NEXT:    movl $1024, %edx # imm = 0x400
 ; CHECK-NEXT:    callq __llvm_memmove_element_unordered_atomic_4 at PLT
 ; CHECK-NEXT:    popq %rax
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
index 0a52dfff71eda4..c483528ab327fa 100644
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -154,9 +154,8 @@ define <4 x i64> @vsext_v4i64_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl)
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pslld $31, %xmm0
 ; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: vsext_v4i64_v4i1:
diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index 90e075bfabf0a2..c10a1ae9b8fb78 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -34,12 +34,11 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a0:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -99,12 +98,11 @@ define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) n
 ; X86-NOBMI-LABEL: bextr32_a0_arithmetic:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    sarl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -166,12 +164,11 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-LABEL: bextr32_a1_indexzext:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -232,10 +229,10 @@ define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-NOBMI-LABEL: bextr32_a2_load:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
@@ -302,10 +299,10 @@ define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-NOBMI-LABEL: bextr32_a3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
@@ -373,12 +370,11 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X86-NOBMI-LABEL: bextr32_a4_commutative:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -440,10 +436,10 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl %eax, %ecx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    movl $1, %esi
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
@@ -2216,12 +2212,11 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_b0:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -2281,12 +2276,11 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-LABEL: bextr32_b1_indexzext:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -2347,10 +2341,10 @@ define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-NOBMI-LABEL: bextr32_b2_load:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
@@ -2417,10 +2411,10 @@ define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-NOBMI-LABEL: bextr32_b3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
@@ -2488,12 +2482,11 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X86-NOBMI-LABEL: bextr32_b4_commutative:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -2555,10 +2548,10 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl %eax, %ecx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
index 62466bfa98ec2f..128e3d28909635 100644
--- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
+++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
@@ -584,11 +584,10 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; SSE41-NEXT:    pcmpeqq %xmm4, %xmm1
 ; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    packssdw %xmm1, %xmm0
-; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
 ; SSE41-NEXT:    pslld $31, %xmm1
 ; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE2-LABEL: eq_or_to_abs_vec4x64_sext:
@@ -724,11 +723,10 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    packssdw %xmm1, %xmm0
 ; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
 ; SSE41-NEXT:    pslld $31, %xmm1
 ; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE2-LABEL: ne_and_to_abs_vec4x64_sext:
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index cc4d4c4543a515..b909a6b8139e3c 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -1469,13 +1469,12 @@ define <2 x i1> @isnan_v2f(<2 x float> %x) {
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setp %cl
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    setp %al
 ; X86-NEXT:    sahf
 ; X86-NEXT:    setp %dl
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: isnan_v2f:
@@ -1498,13 +1497,12 @@ define <2 x i1> @isnot_nan_v2f(<2 x float> %x) {
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setnp %cl
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    setnp %al
 ; X86-NEXT:    sahf
 ; X86-NEXT:    setnp %dl
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: isnot_nan_v2f:
diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll
index 0aa376195627d4..b24fa93ef4ecba 100644
--- a/llvm/test/CodeGen/X86/ldexp.ll
+++ b/llvm/test/CodeGen/X86/ldexp.ll
@@ -353,10 +353,9 @@ define <2 x double> @ldexp_v2f64(<2 x double> %val, <2 x i32> %exp) {
 ; WIN64-NEXT:    .seh_savexmm %xmm6, 32
 ; WIN64-NEXT:    .seh_endprologue
 ; WIN64-NEXT:    movaps (%rcx), %xmm6
-; WIN64-NEXT:    movl (%rdx), %eax
 ; WIN64-NEXT:    movl 4(%rdx), %esi
+; WIN64-NEXT:    movl (%rdx), %edx
 ; WIN64-NEXT:    movaps %xmm6, %xmm0
-; WIN64-NEXT:    movl %eax, %edx
 ; WIN64-NEXT:    callq ldexp
 ; WIN64-NEXT:    movaps %xmm0, %xmm7
 ; WIN64-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
index 5e168a82e03e7e..ed09823b2b515f 100644
--- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
@@ -77,13 +77,13 @@ define <2 x i256> @test_srl(<2 x i256> %In) nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    shldl $28, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    shldl $28, %ebx, %edx
 ; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    shldl $28, %ecx, %ebx
@@ -159,13 +159,13 @@ define <2 x i256> @test_sra(<2 x i256> %In) nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    shldl $26, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    shldl $26, %ebx, %edx
 ; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    shldl $26, %ecx, %ebx
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index ef85cd146d65f9..1f4aa669a67e5a 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -1263,15 +1263,14 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun
 ; SSE-NEXT:    movapd %xmm4, %xmm14
 ; SSE-NEXT:    mulpd %xmm13, %xmm14
 ; SSE-NEXT:    addpd %xmm10, %xmm14
-; SSE-NEXT:    movapd %xmm6, %xmm4
 ; SSE-NEXT:    mulpd %xmm6, %xmm13
 ; SSE-NEXT:    addpd %xmm15, %xmm13
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
+; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT:    mulpd %xmm6, %xmm8
 ; SSE-NEXT:    movapd %xmm7, %xmm10
 ; SSE-NEXT:    mulpd %xmm8, %xmm10
 ; SSE-NEXT:    addpd %xmm13, %xmm10
-; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT:    mulpd %xmm6, %xmm8
 ; SSE-NEXT:    addpd %xmm14, %xmm8
 ; SSE-NEXT:    movapd %xmm12, %xmm13
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm12[0]
@@ -1289,8 +1288,8 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun
 ; SSE-NEXT:    movapd %xmm5, %xmm14
 ; SSE-NEXT:    mulpd %xmm13, %xmm14
 ; SSE-NEXT:    addpd %xmm12, %xmm14
-; SSE-NEXT:    mulpd %xmm4, %xmm13
-; SSE-NEXT:    movapd %xmm4, %xmm2
+; SSE-NEXT:    mulpd %xmm6, %xmm13
+; SSE-NEXT:    movapd %xmm6, %xmm2
 ; SSE-NEXT:    addpd %xmm15, %xmm13
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1,1]
 ; SSE-NEXT:    movapd %xmm7, %xmm12
@@ -1644,7 +1643,6 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
 ; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
 ; SSE-NEXT:    movaps %xmm14, %xmm15
 ; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[0,0],xmm14[0,0]
@@ -1655,17 +1653,18 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1]
 ; SSE-NEXT:    movaps %xmm3, %xmm10
 ; SSE-NEXT:    movaps %xmm3, %xmm12
+; SSE-NEXT:    addps %xmm15, %xmm0
 ; SSE-NEXT:    mulps %xmm0, %xmm10
 ; SSE-NEXT:    addps %xmm5, %xmm10
 ; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    addps %xmm15, %xmm0
 ; SSE-NEXT:    movaps %xmm14, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[2,2]
 ; SSE-NEXT:    movaps %xmm4, %xmm2
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT:    movaps %xmm4, %xmm15
 ; SSE-NEXT:    mulps %xmm1, %xmm2
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
 ; SSE-NEXT:    addps %xmm0, %xmm2
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT:    mulps %xmm11, %xmm1
 ; SSE-NEXT:    addps %xmm10, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,3,3,3]
@@ -1914,8 +1913,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT:    movaps %xmm4, %xmm7
 ; SSE-NEXT:    mulps %xmm0, %xmm7
 ; SSE-NEXT:    addps %xmm1, %xmm7
-; SSE-NEXT:    movaps %xmm6, %xmm3
+; SSE-NEXT:    movaps %xmm12, %xmm15
 ; SSE-NEXT:    mulps %xmm6, %xmm0
+; SSE-NEXT:    movaps %xmm6, %xmm3
 ; SSE-NEXT:    addps %xmm2, %xmm0
 ; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT:    movaps %xmm4, %xmm1
@@ -1956,7 +1956,6 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT:    mulps %xmm1, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm14
 ; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1]
-; SSE-NEXT:    movaps %xmm12, %xmm15
 ; SSE-NEXT:    movaps %xmm12, %xmm13
 ; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    mulps %xmm14, %xmm15
@@ -2750,7 +2749,6 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; AVX512F-NEXT:    vmulps %ymm6, %ymm8, %ymm6
 ; AVX512F-NEXT:    vaddps %ymm6, %ymm4, %ymm6
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
 ; AVX512F-NEXT:    vbroadcastss %xmm7, %ymm12
 ; AVX512F-NEXT:    vmulps %ymm0, %ymm12, %ymm12
 ; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm7[1,1,3,3]
@@ -2799,8 +2797,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; AVX512F-NEXT:    vmulps %ymm1, %ymm10, %ymm1
 ; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vextractf32x4 $3, %zmm7, %xmm1
-; AVX512F-NEXT:    vbroadcastss %xmm1, %ymm1
 ; AVX512F-NEXT:    vmulps %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512F-NEXT:    vbroadcastss %xmm1, %ymm1
 ; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm1 = ymm13[1,1,3,3,5,5,7,7]
 ; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
@@ -3294,7 +3293,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movq %rdi, %rax
-; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
+; SSE-NEXT:    movapd %xmm1, %xmm9
+; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm11
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
 ; SSE-NEXT:    movapd %xmm13, %xmm12
@@ -3303,7 +3303,6 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    mulpd %xmm12, %xmm10
 ; SSE-NEXT:    movapd %xmm2, %xmm8
 ; SSE-NEXT:    mulpd %xmm12, %xmm8
-; SSE-NEXT:    movapd %xmm1, %xmm9
 ; SSE-NEXT:    mulpd %xmm12, %xmm9
 ; SSE-NEXT:    mulpd %xmm0, %xmm12
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1,1]
@@ -3321,7 +3320,6 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    addpd %xmm12, %xmm13
 ; SSE-NEXT:    movapd %xmm11, %xmm6
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0]
-; SSE-NEXT:    movapd %xmm14, %xmm1
 ; SSE-NEXT:    mulpd %xmm6, %xmm1
 ; SSE-NEXT:    addpd %xmm13, %xmm1
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
@@ -3530,16 +3528,16 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    movapd %xmm15, %xmm4
 ; SSE-NEXT:    mulpd %xmm1, %xmm4
 ; SSE-NEXT:    addpd %xmm3, %xmm4
-; SSE-NEXT:    movapd %xmm13, %xmm8
+; SSE-NEXT:    movapd %xmm10, %xmm5
 ; SSE-NEXT:    movapd %xmm13, %xmm3
+; SSE-NEXT:    mulpd %xmm1, %xmm5
+; SSE-NEXT:    addpd %xmm5, %xmm4
+; SSE-NEXT:    movapd %xmm13, %xmm8
 ; SSE-NEXT:    mulpd %xmm0, %xmm3
-; SSE-NEXT:    movapd %xmm10, %xmm5
 ; SSE-NEXT:    movapd %xmm10, %xmm15
-; SSE-NEXT:    mulpd %xmm1, %xmm5
 ; SSE-NEXT:    addpd %xmm3, %xmm5
 ; SSE-NEXT:    movapd %xmm12, %xmm10
 ; SSE-NEXT:    mulpd %xmm12, %xmm0
-; SSE-NEXT:    movapd %xmm14, %xmm9
 ; SSE-NEXT:    mulpd %xmm14, %xmm1
 ; SSE-NEXT:    addpd %xmm0, %xmm1
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
@@ -3589,24 +3587,24 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    addpd %xmm6, %xmm0
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT:    mulpd %xmm1, %xmm4
-; SSE-NEXT:    addpd %xmm5, %xmm4
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT:    mulpd %xmm1, %xmm5
 ; SSE-NEXT:    addpd %xmm7, %xmm5
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
 ; SSE-NEXT:    mulpd %xmm2, %xmm1
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE-NEXT:    movapd %xmm14, %xmm9
 ; SSE-NEXT:    addpd %xmm3, %xmm1
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
 ; SSE-NEXT:    movapd %xmm7, %xmm3
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT:    mulpd %xmm3, %xmm2
+; SSE-NEXT:    mulpd %xmm7, %xmm2
 ; SSE-NEXT:    addpd %xmm1, %xmm2
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    mulpd %xmm3, %xmm1
+; SSE-NEXT:    mulpd %xmm7, %xmm1
 ; SSE-NEXT:    addpd %xmm5, %xmm1
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT:    mulpd %xmm3, %xmm5
+; SSE-NEXT:    mulpd %xmm7, %xmm5
 ; SSE-NEXT:    addpd %xmm4, %xmm5
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT:    mulpd %xmm4, %xmm3
@@ -3751,7 +3749,6 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    mulpd %xmm1, %xmm2
 ; SSE-NEXT:    addpd %xmm3, %xmm2
 ; SSE-NEXT:    mulpd %xmm0, %xmm11
-; SSE-NEXT:    movapd %xmm13, %xmm6
 ; SSE-NEXT:    movapd %xmm13, %xmm4
 ; SSE-NEXT:    mulpd %xmm1, %xmm4
 ; SSE-NEXT:    addpd %xmm11, %xmm4
@@ -3795,6 +3792,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0]
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
 ; SSE-NEXT:    mulpd %xmm9, %xmm3
+; SSE-NEXT:    movapd %xmm13, %xmm6
 ; SSE-NEXT:    addpd %xmm0, %xmm3
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
 ; SSE-NEXT:    mulpd %xmm9, %xmm10
@@ -3855,16 +3853,16 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    movapd %xmm12, %xmm2
 ; SSE-NEXT:    mulpd %xmm1, %xmm2
 ; SSE-NEXT:    addpd %xmm3, %xmm2
-; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT:    movapd %xmm14, %xmm3
-; SSE-NEXT:    mulpd %xmm0, %xmm3
 ; SSE-NEXT:    movapd %xmm6, %xmm7
-; SSE-NEXT:    mulpd %xmm1, %xmm7
-; SSE-NEXT:    addpd %xmm3, %xmm7
 ; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT:    movapd %xmm14, %xmm3
+; SSE-NEXT:    mulpd %xmm1, %xmm7
+; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT:    mulpd %xmm0, %xmm3
 ; SSE-NEXT:    movapd %xmm4, %xmm3
+; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT:    addpd %xmm4, %xmm7
 ; SSE-NEXT:    mulpd %xmm0, %xmm3
-; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; SSE-NEXT:    movapd %xmm6, %xmm9
 ; SSE-NEXT:    mulpd %xmm1, %xmm9
 ; SSE-NEXT:    addpd %xmm3, %xmm9
diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index 0299773aa67add..4f4c46c32868b2 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -5292,6 +5292,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r11, %r13
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
 ; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %r14
@@ -5300,7 +5301,6 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %r8, %r14
 ; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r11, %r13
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    addq %r14, %rax
diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index 2d7737bfdd3c2e..65bd1df5821042 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -304,13 +304,13 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq (%rdi), %rbx
 ; X64-NEXT:    movq 8(%rdi), %r11
 ; X64-NEXT:    movq 16(%rdi), %r10
-; X64-NEXT:    movq 16(%rsi), %r8
 ; X64-NEXT:    movq (%rsi), %r9
 ; X64-NEXT:    movq 8(%rsi), %r14
 ; X64-NEXT:    movq 24(%rdi), %r15
 ; X64-NEXT:    imulq %r9, %r15
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq 16(%rsi), %r8
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    imulq %r14, %r10
 ; X64-NEXT:    addq %rdx, %r10
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 64f6746e616ede..355f770fd17044 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -1218,11 +1218,11 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %rcx, %r9
 ; X64-NEXT:    adcq %rsi, %rdx
 ; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rbx, %rsi
 ; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rbx, %rsi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r8
diff --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index f3741dc202dc58..d7b8734e96e396 100644
--- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -277,12 +277,12 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64
 ; CHECK32-NEXT:    pushl %esi
 ; CHECK32-NEXT:    pushl %eax
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK32-NEXT:    lock cmpxchg8b (%esi)
 ; CHECK32-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 2b475644a38cf6..f390c919f46d25 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1307,10 +1307,9 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
 ; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX2-NEXT:    vpmuldq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm1
+; AVX2-NEXT:    vpmuldq %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: mul_v8i64_sext:
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index c2a009f06b89df..b60b36744f038b 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -1111,10 +1111,10 @@ define <32 x i32> @zext_mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512F-LABEL: zext_mulhuw_v32i16_lshr:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
@@ -1206,10 +1206,10 @@ define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512F-LABEL: mulhsw_v32i16_lshr:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
@@ -1307,10 +1307,10 @@ define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512F-LABEL: mulhsw_v32i16_ashr:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
@@ -1476,7 +1476,6 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
 ; AVX512F-LABEL: zext_mulhuw_v64i16_lshr:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm4
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
@@ -1485,6 +1484,7 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vpmulhuw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
@@ -1657,7 +1657,6 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
 ; AVX512F-LABEL: mulhsw_v64i16_lshr:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm4
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
@@ -1666,6 +1665,7 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vpmulhw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
@@ -1839,7 +1839,6 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
 ; AVX512F-LABEL: mulhsw_v64i16_ashr:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm4
-; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
@@ -1848,6 +1847,7 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
 ; AVX512F-NEXT:    vpmulhw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm3
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
diff --git a/llvm/test/CodeGen/X86/pointer-vector.ll b/llvm/test/CodeGen/X86/pointer-vector.ll
index aa9b977482fc1b..0e8c596783e576 100644
--- a/llvm/test/CodeGen/X86/pointer-vector.ll
+++ b/llvm/test/CodeGen/X86/pointer-vector.ll
@@ -5,9 +5,8 @@
 define <8 x ptr> @SHUFF0(<4 x ptr> %ptrv) nounwind {
 ; CHECK-LABEL: SHUFF0:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,1,2]
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,1]
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,1,2]
 ; CHECK-NEXT:    retl
 entry:
   %G = shufflevector <4 x ptr> %ptrv, <4 x ptr> %ptrv, <8 x i32> <i32 2, i32 7, i32 1, i32 2, i32 4, i32 5, i32 1, i32 1>
diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll
index b0aa566a8235fd..bec10ac80f1ccf 100644
--- a/llvm/test/CodeGen/X86/pr11334.ll
+++ b/llvm/test/CodeGen/X86/pr11334.ll
@@ -63,11 +63,10 @@ define <8 x double> @v8f2d_ext_vec(<8 x float> %v1) nounwind {
 ; SSE-NEXT:    cvtps2pd %xmm0, %xmm5
 ; SSE-NEXT:    cvtps2pd %xmm1, %xmm2
 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvtps2pd %xmm0, %xmm4
 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    cvtps2pd %xmm1, %xmm3
+; SSE-NEXT:    cvtps2pd %xmm0, %xmm1
 ; SSE-NEXT:    movaps %xmm5, %xmm0
-; SSE-NEXT:    movaps %xmm4, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: v8f2d_ext_vec:
diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll
index 29922c2ac1a716..ea72b64ee00c01 100644
--- a/llvm/test/CodeGen/X86/pr34177.ll
+++ b/llvm/test/CodeGen/X86/pr34177.ll
@@ -49,13 +49,13 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
 ; AVX512VL-LABEL: test:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
-; AVX512VL-NEXT:    kshiftrb $2, %k0, %k1
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    testb $2, %al
 ; AVX512VL-NEXT:    fld1
 ; AVX512VL-NEXT:    fldz
 ; AVX512VL-NEXT:    fld %st(0)
 ; AVX512VL-NEXT:    fcmovne %st(2), %st
+; AVX512VL-NEXT:    kshiftrb $2, %k0, %k1
 ; AVX512VL-NEXT:    testb $1, %al
 ; AVX512VL-NEXT:    fld %st(1)
 ; AVX512VL-NEXT:    fcmovne %st(3), %st
diff --git a/llvm/test/CodeGen/X86/pr61964.ll b/llvm/test/CodeGen/X86/pr61964.ll
index 1949841ea216bf..4fea9c8cffec6f 100644
--- a/llvm/test/CodeGen/X86/pr61964.ll
+++ b/llvm/test/CodeGen/X86/pr61964.ll
@@ -18,11 +18,11 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
@@ -31,10 +31,9 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm3, %ymm1
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splitTransposeDecode_8_avx2:
@@ -55,11 +54,11 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16
 ; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
 ; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
 ; XOPAVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; XOPAVX1-NEXT:    retq
@@ -68,10 +67,9 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; XOPAVX2-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7]
-; XOPAVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm2
 ; XOPAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
 ; XOPAVX2-NEXT:    vpermd %ymm0, %ymm3, %ymm1
-; XOPAVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; XOPAVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm0
 ; XOPAVX2-NEXT:    retq
   %shuffle.i = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   %shuffle.i59 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f2f..779999816ebbf2 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -406,7 +406,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    subl $92, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -429,6 +428,7 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; i686-NEXT:    sarl $31, %eax
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -550,16 +550,16 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    subl $100, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll
index 4a0a68ee322434..d5baa7459ff803 100644
--- a/llvm/test/CodeGen/X86/sibcall.ll
+++ b/llvm/test/CodeGen/X86/sibcall.ll
@@ -714,9 +714,8 @@ define fastcc void @t21_sret_to_sret_more_args2(ptr noalias sret(%struct.foo) %a
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    calll f_sret at PLT
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    addl $8, %esp
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index ce56283df6010b..0771beecda770c 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -167,10 +167,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    shldl $30, %eax, %esi
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 85c966c447fad6..14db7ac90ef570 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -266,10 +266,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrdl $2, %edx, %ecx
 ; X86-NEXT:    cmpl $2, %edx
 ; X86-NEXT:    movl $2147483647, %ebp # imm = 0x7FFFFFFF
@@ -277,7 +277,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    cmpl $-2, %edx
 ; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
 ; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    shrdl $2, %edx, %edi
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index b2b5bcc5b44b2c..8a5d3bb0936775 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -89,12 +89,12 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %ebx
@@ -583,12 +583,12 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %ebx
@@ -1293,9 +1293,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index f1fd05565c47e9..554548fa8f4c33 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -1659,17 +1659,19 @@ define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movzbl 13(%rdi), %eax
 ; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT:    movzbl 12(%rdi), %r15d
 ; SCALAR-NEXT:    movzbl 11(%rdi), %eax
 ; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT:    movzbl 6(%rdi), %r10d
 ; SCALAR-NEXT:    movzbl 10(%rdi), %ebp
+; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT:    notb %r10b
 ; SCALAR-NEXT:    movzbl 9(%rdi), %r14d
 ; SCALAR-NEXT:    movzbl 8(%rdi), %eax
 ; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movzbl 7(%rdi), %r12d
-; SCALAR-NEXT:    movzbl 6(%rdi), %r10d
 ; SCALAR-NEXT:    movzbl 5(%rdi), %r9d
 ; SCALAR-NEXT:    movzbl 4(%rdi), %ebx
+; SCALAR-NEXT:    movzbl 12(%rdi), %r15d
 ; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
 ; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
 ; SCALAR-NEXT:    movzbl (%rdi), %eax
@@ -1680,11 +1682,9 @@ define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    notb %cl
 ; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %bl
 ; SCALAR-NEXT:    notb %r9b
 ; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT:    notb %r10b
 ; SCALAR-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r12b
 ; SCALAR-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
@@ -4756,8 +4756,10 @@ define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movzbl 8(%rdi), %r14d
 ; SCALAR-NEXT:    movzbl 7(%rdi), %ebx
 ; SCALAR-NEXT:    movzbl 6(%rdi), %r10d
-; SCALAR-NEXT:    movzbl 5(%rdi), %r15d
 ; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
+; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT:    notb %r9b
+; SCALAR-NEXT:    movzbl 5(%rdi), %r15d
 ; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
 ; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
 ; SCALAR-NEXT:    movzbl (%rdi), %eax
@@ -4769,8 +4771,6 @@ define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    notb %cl
 ; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT:    notb %r9b
 ; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movl %r15d, %r9d
 ; SCALAR-NEXT:    notb %r9b
@@ -6694,8 +6694,10 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movzbl 8(%rdi), %r14d
 ; SCALAR-NEXT:    movzbl 7(%rdi), %ebp
 ; SCALAR-NEXT:    movzbl 6(%rdi), %r11d
-; SCALAR-NEXT:    movzbl 5(%rdi), %ebx
 ; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
+; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT:    notb %r9b
+; SCALAR-NEXT:    movzbl 5(%rdi), %ebx
 ; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
 ; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
 ; SCALAR-NEXT:    movzbl (%rdi), %eax
@@ -6707,8 +6709,6 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    notb %cl
 ; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT:    notb %r9b
 ; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movl %ebx, %r9d
 ; SCALAR-NEXT:    notb %r9b
@@ -6761,12 +6761,13 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movb %r11b, 15(%rdx)
 ; SCALAR-NEXT:    movb %r8b, 14(%rdx)
 ; SCALAR-NEXT:    movb %al, 13(%rdx)
-; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SCALAR-NEXT:    movb %r8b, 4(%rdx)
 ; SCALAR-NEXT:    movb %al, 12(%rdx)
+; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; SCALAR-NEXT:    movb %r13b, 11(%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
 ; SCALAR-NEXT:    movb %r15b, 10(%rdx)
-; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
 ; SCALAR-NEXT:    movb %sil, 9(%rdx)
 ; SCALAR-NEXT:    movb %r12b, 8(%rdx)
 ; SCALAR-NEXT:    movb %r14b, 7(%rdx)
@@ -6775,13 +6776,13 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movl %r9d, %r11d
 ; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
-; SCALAR-NEXT:    movb %r8b, 4(%rdx)
 ; SCALAR-NEXT:    movb %bpl, 3(%rdx)
 ; SCALAR-NEXT:    movb %dil, 2(%rdx)
 ; SCALAR-NEXT:    movb %cl, 1(%rdx)
 ; SCALAR-NEXT:    movl %ecx, %r14d
-; SCALAR-NEXT:    movl %r10d, %esi
 ; SCALAR-NEXT:    movb %r10b, (%rdx)
+; SCALAR-NEXT:    movb %al, 22(%rdx)
+; SCALAR-NEXT:    movl %r10d, %esi
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; SCALAR-NEXT:    movb %cl, 31(%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
@@ -6799,7 +6800,6 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
 ; SCALAR-NEXT:    movb %bl, 23(%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT:    movb %al, 22(%rdx)
 ; SCALAR-NEXT:    movb %r11b, 21(%rdx)
 ; SCALAR-NEXT:    movb %r8b, 20(%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index ccabb360a990c9..939452c98e0e37 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -88,13 +88,12 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
@@ -341,8 +340,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    movl %edx, %ebx
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index eacc714b49a4d4..f357e57b305991 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -123,10 +123,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    shldl $30, %eax, %esi
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index 8c7078c7263284..f40276822b3fa7 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -195,15 +195,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrdl $2, %edx, %esi
 ; X86-NEXT:    cmpl $4, %edx
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovael %ecx, %esi
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    shrdl $2, %edx, %ebx
@@ -392,13 +391,12 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $-1, %edi
 ; X86-NEXT:    cmovol %edi, %esi
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    cmovol %edi, %ebx
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 82603b35ba7128..901ad0acae21be 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -48,12 +48,11 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    leal (%ecx,%eax), %esi
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index af841cf38b24ae..fe8419d51e7052 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -2686,10 +2686,9 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_8i16_to_8f32:
@@ -2732,10 +2731,9 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_8i8_to_8f32:
@@ -2780,10 +2778,9 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_16i8_to_8f32:
@@ -5162,16 +5159,14 @@ define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind {
 define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, ptr %p) nounwind {
 ; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; SSE-NEXT:    movss %xmm0, (%rdi)
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm1
 ; AVX-NEXT:    vmovss %xmm0, (%rdi)
-; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
   %r = sitofp i32 %e to float
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 460c5fe11f82a5..18ce7504927943 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -593,7 +593,8 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE41-NEXT:    pxor %xmm2, %xmm3
-; SSE41-NEXT:    pmovsxbd %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    pmovsxbd %xmm3, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE41-NEXT:    pslld $31, %xmm1
@@ -606,8 +607,6 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; SSE41-NEXT:    pslld $31, %xmm3
 ; SSE41-NEXT:    psrad $31, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, (%rdi)
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: saddo_v16i8:
@@ -769,9 +768,8 @@ define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SSE2-NEXT:    movdqa %xmm0, (%rdi)
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: saddo_v2i64:
@@ -792,9 +790,8 @@ define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
 ; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: saddo_v2i64:
@@ -815,9 +812,8 @@ define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
 ; SSE41-NEXT:    pxor %xmm3, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SSE41-NEXT:    movdqa %xmm0, (%rdi)
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: saddo_v2i64:
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index d06993da6365d8..1688bc31f7e53f 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -598,7 +598,8 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE41-NEXT:    pxor %xmm2, %xmm3
-; SSE41-NEXT:    pmovsxbd %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    pmovsxbd %xmm3, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE41-NEXT:    pslld $31, %xmm1
@@ -611,8 +612,6 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; SSE41-NEXT:    pslld $31, %xmm3
 ; SSE41-NEXT:    psrad $31, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, (%rdi)
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v16i8:
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 6311678924d06a..00898ab313a3cc 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -195,11 +195,10 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
 ; AVX1-NEXT:    vmovq %xmm0, (%rdi)
-; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: umulo_v3i32:
@@ -213,11 +212,10 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
 ; AVX2-NEXT:    vmovq %xmm0, (%rdi)
-; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: umulo_v3i32:
@@ -313,10 +311,9 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
-; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: umulo_v4i32:
@@ -330,10 +327,9 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: umulo_v4i32:
@@ -529,12 +525,11 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
 ; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm7, %xmm5
 ; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
 ; AVX1-NEXT:    vmovq %xmm1, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
-; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm0
+; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: umulo_v6i32:
@@ -548,10 +543,10 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vmovq %xmm1, 16(%rdi)
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
 ; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX2-NEXT:    retq
@@ -701,12 +696,11 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
 ; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm7, %xmm5
 ; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
 ; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
-; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm0
+; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: umulo_v8i32:
@@ -720,10 +714,9 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: umulo_v8i32:
diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll
index 63ca7c6a005732..bac925c0407630 100644
--- a/llvm/test/CodeGen/X86/vector-interleave.ll
+++ b/llvm/test/CodeGen/X86/vector-interleave.ll
@@ -165,11 +165,10 @@ define <8 x double> @interleave2x4f64(<4 x double> %a, <4 x double> %b) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
 ; AVX2-NEXT:    vshufpd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
-; AVX2-NEXT:    vmovapd %ymm2, %ymm0
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
 ; AVX2-NEXT:    retq
   %result = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   ret <8 x double> %result
@@ -203,11 +202,10 @@ define <8 x i64> @interleave2x4i64(<4 x i64> %a, <4 x i64> %b) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovaps %ymm2, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
 ; AVX2-NEXT:    retq
   %result = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   ret <8 x i64> %result
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index 00e43df15deea0..e71aa794640f75 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -946,15 +946,15 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
 ; SSE-NEXT:    movdqa %xmm4, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%rdi), %xmm10
 ; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%rdi), %xmm7
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm13
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pslld $16, %xmm0
 ; SSE-NEXT:    psrad $16, %xmm0
 ; SSE-NEXT:    pslld $16, %xmm13
+; SSE-NEXT:    movdqa 16(%rdi), %xmm7
 ; SSE-NEXT:    psrad $16, %xmm13
 ; SSE-NEXT:    packssdw %xmm0, %xmm13
 ; SSE-NEXT:    movdqa %xmm7, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index f105e065866af8..26f2f17ad24046 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -1232,12 +1232,10 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-LABEL: load_i16_stride3_vf32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $40, %rsp
-; SSE-NEXT:    movdqa 96(%rdi), %xmm5
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm6
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm13
-; SSE-NEXT:    movdqa 160(%rdi), %xmm9
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm11
 ; SSE-NEXT:    movdqa (%rdi), %xmm15
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm10
@@ -1246,20 +1244,21 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm0
 ; SSE-NEXT:    movdqa 64(%rdi), %xmm12
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT:    pand %xmm1, %xmm3
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    pandn %xmm12, %xmm2
 ; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
 ; SSE-NEXT:    por %xmm2, %xmm3
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
+; SSE-NEXT:    movdqa 160(%rdi), %xmm9
+; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,7,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1]
 ; SSE-NEXT:    movdqa %xmm11, %xmm8
 ; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
 ; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1268,16 +1267,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm15, %xmm3
 ; SSE-NEXT:    pand %xmm1, %xmm3
 ; SSE-NEXT:    por %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
 ; SSE-NEXT:    movdqa %xmm10, %xmm11
 ; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
+; SSE-NEXT:    movdqa 96(%rdi), %xmm5
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
 ; SSE-NEXT:    movaps %xmm4, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index df28ac14a30c0a..0f9f83bafdf93f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1879,9 +1879,9 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index b18f08b62f0d4c..22262b414df1a2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -2411,21 +2411,21 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa (%rdi), %xmm10
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm13
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm9
-; SSE-NEXT:    movdqa 48(%rdi), %xmm5
 ; SSE-NEXT:    movdqa 224(%rdi), %xmm7
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm12
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm3
-; SSE-NEXT:    movdqa 192(%rdi), %xmm2
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 192(%rdi), %xmm2
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pandn %xmm2, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    movdqa 48(%rdi), %xmm5
+; SSE-NEXT:    pandn %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
 ; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    por %xmm1, %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
 ; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
@@ -3673,12 +3673,12 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm14
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
 ; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm6
 ; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4887,21 +4887,21 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    subq $1016, %rsp # imm = 0x3F8
 ; SSE-NEXT:    movdqa 464(%rdi), %xmm5
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 400(%rdi), %xmm8
+; SSE-NEXT:    movdqa 144(%rdi), %xmm6
 ; SSE-NEXT:    movdqa 416(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 448(%rdi), %xmm4
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 432(%rdi), %xmm7
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 144(%rdi), %xmm6
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm15
 ; SSE-NEXT:    movdqa 96(%rdi), %xmm10
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm14
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    movdqa 400(%rdi), %xmm8
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    pandn %xmm2, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3]
 ; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -6560,9 +6560,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $1048, %rsp # imm = 0x418
 ; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm14
 ; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm14
 ; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm8
 ; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
@@ -6574,6 +6573,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm4
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index 605deed6536bf3..a53ca1ccaa668c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -1579,9 +1579,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-LABEL: load_i16_stride6_vf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $136, %rsp
-; SSE-NEXT:    movdqa 112(%rdi), %xmm9
-; SSE-NEXT:    movdqa 128(%rdi), %xmm7
 ; SSE-NEXT:    movdqa 64(%rdi), %xmm2
+; SSE-NEXT:    movdqa 128(%rdi), %xmm7
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm11
 ; SSE-NEXT:    movdqa (%rdi), %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1592,10 +1591,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
-; SSE-NEXT:    pandn %xmm0, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
+; SSE-NEXT:    pandn %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT:    movdqa 112(%rdi), %xmm9
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
 ; SSE-NEXT:    pand %xmm10, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
@@ -1625,6 +1625,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
 ; SSE-NEXT:    movdqa %xmm9, %xmm11
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
@@ -1657,11 +1658,12 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    # xmm9 = mem[0,1,2,3,5,7,6,7]
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3]
 ; SSE-NEXT:    movdqa %xmm10, %xmm13
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3]
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; SSE-NEXT:    pandn %xmm15, %xmm13
 ; SSE-NEXT:    pand %xmm10, %xmm9
 ; SSE-NEXT:    por %xmm13, %xmm9
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
 ; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0]
 ; SSE-NEXT:    movdqa %xmm11, %xmm4
@@ -1672,8 +1674,6 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm7, %xmm5
 ; SSE-NEXT:    pandn %xmm7, %xmm10
 ; SSE-NEXT:    por %xmm2, %xmm10
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0]
 ; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm15, %xmm1
@@ -4161,13 +4161,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm11
 ; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm3
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm11
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
@@ -11381,8 +11381,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm11, %zmm24
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm0
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm11
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm19
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm0
@@ -11551,8 +11551,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm14, %xmm14
 ; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm10
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm10
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm17
 ; AVX512-FCP-NEXT:    vpternlogq $236, %ymm29, %ymm10, %ymm14
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index af340d15fe8f64..ec9f87b201a958 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -2027,21 +2027,21 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm6
 ; SSE-NEXT:    movaps 160(%rdi), %xmm5
 ; SSE-NEXT:    movaps %xmm5, (%rsp) # 16-byte Spill
-; SSE-NEXT:    movaps 144(%rdi), %xmm7
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm13
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm15
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm14
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
-; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    pandn %xmm0, %xmm2
 ; SSE-NEXT:    movdqa %xmm15, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
 ; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movaps 144(%rdi), %xmm7
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
 ; SSE-NEXT:    movdqa %xmm3, %xmm2
 ; SSE-NEXT:    movdqa %xmm3, %xmm10
@@ -2067,7 +2067,6 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    pandn %xmm0, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
 ; SSE-NEXT:    pand %xmm1, %xmm9
 ; SSE-NEXT:    por %xmm2, %xmm9
 ; SSE-NEXT:    movdqa %xmm10, %xmm2
@@ -4151,20 +4150,20 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm8
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 160(%rdi), %xmm7
-; SSE-NEXT:    movaps 144(%rdi), %xmm10
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm9
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm12
+; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm1, %xmm11
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pandn %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
 ; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
+; SSE-NEXT:    movaps 144(%rdi), %xmm10
 ; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0]
@@ -7703,6 +7702,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm1[0,1,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7,8,9,10],ymm15[11],ymm3[12,13,14,15]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm20, %zmm25, %zmm11
 ; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm12
@@ -7714,7 +7714,6 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
 ; AVX512DQ-NEXT:    vpor %ymm3, %ymm12, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm20, %zmm25, %zmm11
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1}
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
@@ -8631,12 +8630,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps 144(%rdi), %xmm13
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 176(%rdi), %xmm4
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm11
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
-; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 176(%rdi), %xmm4
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
 ; SSE-NEXT:    pandn %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm4, %xmm0
@@ -13252,23 +13251,23 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-LABEL: load_i16_stride7_vf64:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $1544, %rsp # imm = 0x608
-; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm14
 ; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovdqa %ymm1, %ymm11
+; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, %ymm11
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
 ; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7]
@@ -14102,13 +14101,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
-; AVX512-NEXT:    vmovdqa %ymm5, %ymm6
 ; AVX512-NEXT:    vmovdqa %ymm4, %ymm8
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm19
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm9
 ; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm11
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm21
@@ -14127,7 +14127,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
 ; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm17
-; AVX512-NEXT:    vmovdqa64 %ymm6, %ymm19
 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -14175,7 +14174,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 640(%rdi), %ymm16
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512-NEXT:    vmovdqa 688(%rdi), %xmm3
@@ -14481,7 +14479,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6],xmm12[7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-NEXT:    vmovdqa %ymm1, %ymm15
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
 ; AVX512-NEXT:    vextracti32x4 $1, %ymm12, %xmm25
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3]
@@ -14489,6 +14486,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX512-NEXT:    vmovdqa %ymm1, %ymm15
 ; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15019,6 +15017,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm3, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm29
 ; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
@@ -15072,7 +15071,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6,7]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm29
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermd %zmm26, %zmm23, %zmm9
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
@@ -17012,7 +17010,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm9, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm23
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm22
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm9
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3,4,5],xmm3[6],xmm9[7]
@@ -17025,6 +17022,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17085,7 +17083,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpermd %zmm30, %zmm16, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm7
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index 1b637cd203c8f9..ff1e9cf28f2ea8 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -3458,20 +3458,20 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 192(%rdi), %xmm3
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 240(%rdi), %xmm5
+; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 224(%rdi), %xmm15
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 176(%rdi), %xmm7
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm6
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm4
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 176(%rdi), %xmm7
 ; SSE-NEXT:    movdqa %xmm7, (%rsp) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm0
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 192(%rdi), %xmm3
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
 ; SSE-NEXT:    movdqa %xmm0, %xmm7
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4473,8 +4473,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
 ; AVX2-NEXT:    vmovdqa %xmm9, %xmm14
 ; AVX2-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -4497,7 +4498,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
@@ -4934,8 +4934,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
 ; AVX2-FP-NEXT:    vmovdqa %xmm9, %xmm14
 ; AVX2-FP-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1]
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-FP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -4958,7 +4959,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
@@ -5395,8 +5395,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
 ; AVX2-FCP-NEXT:    vmovdqa %xmm9, %xmm14
 ; AVX2-FCP-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1]
 ; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -5419,7 +5420,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
@@ -6261,7 +6261,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
 ; AVX512-FCP-NEXT:    vpermt2d %xmm22, %xmm0, %xmm2
 ; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
@@ -6289,6 +6288,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
@@ -7034,7 +7034,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %xmm22, %xmm0, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
@@ -7062,6 +7061,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
@@ -12886,8 +12886,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
 ; AVX512-NEXT:    vmovdqa %xmm5, (%rsp) # 16-byte Spill
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpermt2d %xmm1, %xmm10, %xmm2
 ; AVX512-NEXT:    vmovdqa 560(%rdi), %xmm1
@@ -12973,13 +12975,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT:    vmovdqa %xmm8, %xmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
 ; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm4
@@ -13055,7 +13056,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm8 = mem[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -13389,10 +13389,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
 ; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm6
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm5
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -13455,7 +13456,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm20
 ; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm10 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
@@ -13852,7 +13852,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpermt2d %xmm17, %xmm13, %xmm9
@@ -13861,8 +13860,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm11
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
@@ -14726,8 +14726,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rsp) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpermt2d %xmm1, %xmm10, %xmm2
 ; AVX512DQ-NEXT:    vmovdqa 560(%rdi), %xmm1
@@ -14813,13 +14815,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa %xmm8, %xmm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm4
@@ -14895,7 +14896,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm8 = mem[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -15229,10 +15229,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm6
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm5
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -15295,7 +15296,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm20
 ; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm10 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
@@ -15692,7 +15692,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpermt2d %xmm17, %xmm13, %xmm9
@@ -15701,8 +15700,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm11
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index afdeebc45ed0ab..65e3ba8b8200b1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -727,10 +727,9 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
 ; SSE-LABEL: load_i32_stride3_vf16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps 96(%rdi), %xmm6
+; SSE-NEXT:    movaps 144(%rdi), %xmm11
 ; SSE-NEXT:    movaps 128(%rdi), %xmm1
 ; SSE-NEXT:    movaps 112(%rdi), %xmm13
-; SSE-NEXT:    movaps 144(%rdi), %xmm11
 ; SSE-NEXT:    movaps 176(%rdi), %xmm10
 ; SSE-NEXT:    movaps 160(%rdi), %xmm9
 ; SSE-NEXT:    movaps (%rdi), %xmm7
@@ -739,16 +738,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 48(%rdi), %xmm15
 ; SSE-NEXT:    movaps 80(%rdi), %xmm14
-; SSE-NEXT:    movaps 64(%rdi), %xmm2
 ; SSE-NEXT:    movaps %xmm2, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0]
 ; SSE-NEXT:    movaps %xmm15, %xmm5
+; SSE-NEXT:    movaps 64(%rdi), %xmm2
 ; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm8, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
 ; SSE-NEXT:    movaps %xmm7, %xmm5
-; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm8[0,2]
+; SSE-NEXT:    movaps 96(%rdi), %xmm6
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm9, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0]
@@ -1198,31 +1198,31 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-LABEL: load_i32_stride3_vf32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $392, %rsp # imm = 0x188
-; SSE-NEXT:    movaps 192(%rdi), %xmm4
+; SSE-NEXT:    movaps 240(%rdi), %xmm7
 ; SSE-NEXT:    movaps 224(%rdi), %xmm3
-; SSE-NEXT:    movaps 208(%rdi), %xmm14
 ; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 240(%rdi), %xmm7
+; SSE-NEXT:    movaps 208(%rdi), %xmm14
 ; SSE-NEXT:    movaps 272(%rdi), %xmm6
 ; SSE-NEXT:    movaps 256(%rdi), %xmm9
-; SSE-NEXT:    movaps (%rdi), %xmm13
+; SSE-NEXT:    movaps 48(%rdi), %xmm2
+; SSE-NEXT:    movaps 32(%rdi), %xmm11
 ; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 16(%rdi), %xmm8
 ; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 32(%rdi), %xmm11
 ; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 48(%rdi), %xmm2
-; SSE-NEXT:    movaps 80(%rdi), %xmm1
+; SSE-NEXT:    movaps 192(%rdi), %xmm4
 ; SSE-NEXT:    movaps 64(%rdi), %xmm5
 ; SSE-NEXT:    movaps %xmm5, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
+; SSE-NEXT:    movaps 80(%rdi), %xmm1
+; SSE-NEXT:    movaps (%rdi), %xmm13
 ; SSE-NEXT:    movaps %xmm1, %xmm12
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm9, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,2]
+; SSE-NEXT:    movaps 16(%rdi), %xmm8
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0]
 ; SSE-NEXT:    movaps %xmm6, %xmm10
 ; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1259,6 +1259,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps 352(%rdi), %xmm0
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
+; SSE-NEXT:    movaps %xmm7, %xmm14
 ; SSE-NEXT:    movaps 336(%rdi), %xmm4
 ; SSE-NEXT:    movaps %xmm4, %xmm1
 ; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1288,7 +1289,6 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm12[2,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm7, %xmm14
 ; SSE-NEXT:    movaps %xmm9, %xmm0
 ; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,0],xmm9[0,0]
@@ -1743,7 +1743,6 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-LABEL: load_i32_stride3_vf32:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $72, %rsp
-; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
@@ -1764,6 +1763,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
 ; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm12, %ymm3
+; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7]
@@ -2145,19 +2145,19 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 448(%rdi), %xmm11
 ; SSE-NEXT:    movaps %xmm11, (%rsp) # 16-byte Spill
-; SSE-NEXT:    movaps 240(%rdi), %xmm7
+; SSE-NEXT:    movaps 48(%rdi), %xmm9
 ; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 272(%rdi), %xmm3
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 256(%rdi), %xmm13
 ; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 48(%rdi), %xmm9
 ; SSE-NEXT:    movaps 80(%rdi), %xmm1
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 64(%rdi), %xmm12
 ; SSE-NEXT:    movaps %xmm12, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
 ; SSE-NEXT:    movaps %xmm9, %xmm1
+; SSE-NEXT:    movaps 240(%rdi), %xmm7
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0]
+; SSE-NEXT:    movaps 256(%rdi), %xmm13
 ; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 3874581e621b3a..aa23dcc824c72d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -1030,7 +1030,6 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    subq $264, %rsp # imm = 0x108
 ; AVX-NEXT:    vmovaps 64(%rdi), %ymm5
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm4
 ; AVX-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX-NEXT:    vmovaps 224(%rdi), %ymm3
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1]
@@ -1041,13 +1040,14 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
 ; AVX-NEXT:    vmovaps %ymm2, %ymm10
-; AVX-NEXT:    vmovaps %ymm1, %ymm3
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4]
+; AVX-NEXT:    vmovaps %ymm1, %ymm3
 ; AVX-NEXT:    vmovaps 160(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 176(%rdi), %xmm6
 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm1[0]
 ; AVX-NEXT:    vmovaps %xmm6, %xmm2
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm4
 ; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 144(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm6
@@ -1776,9 +1776,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 304(%rdi), %xmm8
 ; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 288(%rdi), %xmm2
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 336(%rdi), %xmm10
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 320(%rdi), %xmm6
 ; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1799,8 +1798,9 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
 ; SSE-NEXT:    movaps %xmm4, %xmm1
 ; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm5, %xmm1
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 288(%rdi), %xmm2
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
 ; SSE-NEXT:    movaps %xmm6, %xmm5
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
@@ -3515,17 +3515,14 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps 144(%rdi), %xmm14
 ; SSE-NEXT:    movaps 176(%rdi), %xmm11
 ; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 160(%rdi), %xmm5
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 208(%rdi), %xmm3
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 192(%rdi), %xmm8
-; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 80(%rdi), %xmm10
 ; SSE-NEXT:    movaps 240(%rdi), %xmm6
+; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 224(%rdi), %xmm7
 ; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 80(%rdi), %xmm10
 ; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 64(%rdi), %xmm4
 ; SSE-NEXT:    movaps 112(%rdi), %xmm2
@@ -3535,10 +3532,13 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT:    movaps %xmm4, %xmm2
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
+; SSE-NEXT:    movaps 224(%rdi), %xmm7
 ; SSE-NEXT:    movaps %xmm2, %xmm1
+; SSE-NEXT:    movaps 192(%rdi), %xmm8
 ; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm7, %xmm1
+; SSE-NEXT:    movaps 160(%rdi), %xmm5
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
 ; SSE-NEXT:    movaps %xmm8, %xmm6
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
@@ -4673,19 +4673,19 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps 704(%rdi), %ymm7
 ; AVX2-NEXT:    vmovaps 416(%rdi), %ymm8
 ; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 448(%rdi), %ymm4
-; AVX2-NEXT:    vmovaps 480(%rdi), %ymm15
 ; AVX2-NEXT:    vmovaps 160(%rdi), %ymm9
+; AVX2-NEXT:    vmovaps 480(%rdi), %ymm15
 ; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 192(%rdi), %ymm10
 ; AVX2-NEXT:    vmovaps 224(%rdi), %ymm14
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
 ; AVX2-NEXT:    vpermps %ymm14, %ymm2, %ymm0
 ; AVX2-NEXT:    vpermps %ymm10, %ymm2, %ymm1
-; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 144(%rdi), %xmm3
 ; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovaps 448(%rdi), %ymm4
 ; AVX2-NEXT:    vmovaps 128(%rdi), %xmm1
 ; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
@@ -5166,19 +5166,19 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm15
 ; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm15
 ; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm14
 ; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
 ; AVX2-FP-NEXT:    vpermps %ymm14, %ymm2, %ymm0
 ; AVX2-FP-NEXT:    vpermps %ymm10, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 144(%rdi), %xmm3
 ; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovaps 128(%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
@@ -5659,19 +5659,19 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm14
 ; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
 ; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm2, %ymm0
 ; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 144(%rdi), %xmm3
 ; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index dd94dffa859329..1238b1c0976286 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -686,34 +686,34 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
 ; SSE-LABEL: load_i32_stride5_vf8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa 144(%rdi), %xmm3
-; SSE-NEXT:    movdqa 64(%rdi), %xmm0
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm5
-; SSE-NEXT:    movdqa 80(%rdi), %xmm11
+; SSE-NEXT:    movapd 80(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 96(%rdi), %xmm1
 ; SSE-NEXT:    movdqa (%rdi), %xmm14
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm7
-; SSE-NEXT:    movdqa 32(%rdi), %xmm9
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm2
+; SSE-NEXT:    movdqa 32(%rdi), %xmm9
 ; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm14, %xmm8
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[2,2,2,2]
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1]
+; SSE-NEXT:    movdqa 64(%rdi), %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm11, %xmm10
-; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; SSE-NEXT:    movapd %xmm11, %xmm10
+; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1]
+; SSE-NEXT:    unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
 ; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm14[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1]
+; SSE-NEXT:    movdqa 144(%rdi), %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm14[1,1,1,1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
 ; SSE-NEXT:    movsd {{.*#+}} xmm10 = xmm12[0],xmm10[1]
@@ -1293,24 +1293,23 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 272(%rdi), %xmm3
 ; SSE-NEXT:    movdqa 240(%rdi), %xmm14
 ; SSE-NEXT:    movdqa 256(%rdi), %xmm8
-; SSE-NEXT:    movdqa (%rdi), %xmm11
+; SSE-NEXT:    movdqa 208(%rdi), %xmm6
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm15
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm5
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm4
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 208(%rdi), %xmm6
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm7
-; SSE-NEXT:    movdqa 160(%rdi), %xmm10
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movapd 160(%rdi), %xmm10
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm13
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm10, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa (%rdi), %xmm11
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
 ; SSE-NEXT:    movdqa %xmm7, %xmm9
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm11, %xmm1
@@ -1344,7 +1343,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1664,12 +1663,11 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-LABEL: load_i32_stride5_vf16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $72, %rsp
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm8
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm9
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm0
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm9
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
@@ -1685,6 +1683,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7]
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-NEXT:    vpermd %ymm12, %ymm10, %ymm10
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1782,12 +1781,11 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-LABEL: load_i32_stride5_vf16:
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $72, %rsp
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm3
@@ -1803,6 +1801,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1900,12 +1899,11 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-LABEL: load_i32_stride5_vf16:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $72, %rsp
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm3
@@ -1921,6 +1919,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -2520,32 +2519,30 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-LABEL: load_i32_stride5_vf32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $904, %rsp # imm = 0x388
-; SSE-NEXT:    movdqa (%rdi), %xmm11
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm5
+; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm9
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm8
-; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 448(%rdi), %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 432(%rdi), %xmm4
-; SSE-NEXT:    movdqa 400(%rdi), %xmm10
 ; SSE-NEXT:    movdqa 416(%rdi), %xmm14
-; SSE-NEXT:    movdqa 128(%rdi), %xmm6
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movapd 128(%rdi), %xmm6
+; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm7
-; SSE-NEXT:    movdqa 80(%rdi), %xmm12
-; SSE-NEXT:    movdqa 96(%rdi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm1, %xmm15
-; SSE-NEXT:    movdqa %xmm12, %xmm1
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 96(%rdi), %xmm15
+; SSE-NEXT:    movapd 80(%rdi), %xmm12
+; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa 400(%rdi), %xmm10
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1]
+; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE-NEXT:    movdqa (%rdi), %xmm11
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
@@ -2564,9 +2561,9 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 320(%rdi), %xmm2
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 336(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 320(%rdi), %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm1, %xmm9
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3355,17 +3352,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqa 416(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm6
-; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm10
 ; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm8
 ; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm9
-; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm10
 ; AVX2-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm15
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm14
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm14
 ; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
@@ -3634,17 +3631,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm15
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm14
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm14
 ; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
@@ -3913,17 +3910,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm14
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm14
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
@@ -5007,11 +5004,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 432(%rdi), %xmm8
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 400(%rdi), %xmm10
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 128(%rdi), %xmm7
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 416(%rdi), %xmm9
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 128(%rdi), %xmm7
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm5
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -5020,6 +5016,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 96(%rdi), %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE-NEXT:    movdqa 400(%rdi), %xmm10
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
@@ -6745,27 +6742,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 896(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm15
-; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm7
-; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm10
 ; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm9
 ; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm12
 ; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm13
-; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
+; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm7
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
@@ -7302,27 +7299,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 896(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm15
-; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm12
 ; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
+; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
@@ -7859,27 +7856,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 896(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
+; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index 8820dccc40bf4b..864e41510030b2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -1663,30 +1663,29 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    subq $408, %rsp # imm = 0x198
 ; SSE-NEXT:    movdqa 240(%rdi), %xmm9
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 256(%rdi), %xmm3
-; SSE-NEXT:    movdqa 192(%rdi), %xmm10
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 208(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 336(%rdi), %xmm14
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 304(%rdi), %xmm7
 ; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 352(%rdi), %xmm5
-; SSE-NEXT:    movdqa 288(%rdi), %xmm15
-; SSE-NEXT:    movdqa 304(%rdi), %xmm7
 ; SSE-NEXT:    movdqa 64(%rdi), %xmm12
-; SSE-NEXT:    movdqa (%rdi), %xmm8
+; SSE-NEXT:    movdqa 288(%rdi), %xmm15
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE-NEXT:    movapd (%rdi), %xmm8
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa 208(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm13
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm1, %xmm11
+; SSE-NEXT:    movdqa 256(%rdi), %xmm3
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm8, %xmm1
-; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 192(%rdi), %xmm10
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
 ; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm8[0],xmm2[1]
 ; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2052,6 +2051,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[3,1],ymm8[4,5],ymm4[7,5]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm5[3,3]
 ; AVX-NEXT:    vmovaps %ymm7, %ymm1
+; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7]
 ; AVX-NEXT:    vmovups (%rsp), %ymm7 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm1[2,1],ymm7[7,5],ymm1[6,5]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
@@ -2067,7 +2067,6 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7]
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm3
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,2,3,3]
 ; AVX-NEXT:    vmovaps 16(%rdi), %xmm5
@@ -2145,13 +2144,13 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX2-NEXT:    vmovaps 160(%rdi), %ymm3
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 96(%rdi), %ymm15
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 160(%rdi), %ymm3
 ; AVX2-NEXT:    vmovaps (%rdi), %ymm4
 ; AVX2-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 32(%rdi), %ymm5
+; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 64(%rdi), %ymm13
 ; AVX2-NEXT:    vmovaps {{.*#+}} xmm6 = [0,6,4,u]
@@ -2322,13 +2321,13 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm15
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm13
 ; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm6 = [0,6,4,u]
@@ -2502,12 +2501,12 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [0,6,4,u]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
 ; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm12, %ymm7
@@ -3395,19 +3394,17 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 544(%rdi), %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 480(%rdi), %xmm8
 ; SSE-NEXT:    movdqa 496(%rdi), %xmm4
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm10
 ; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm2
-; SSE-NEXT:    movdqa 96(%rdi), %xmm6
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 112(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE-NEXT:    movdqa %xmm1, %xmm11
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm6, %xmm1
+; SSE-NEXT:    movdqa 96(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 480(%rdi), %xmm8
+; SSE-NEXT:    movdqa 112(%rdi), %xmm11
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
 ; SSE-NEXT:    movdqa %xmm2, %xmm6
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index ed316990e48666..b268c4a984cc19 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -2018,21 +2018,21 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-LABEL: load_i32_stride7_vf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $440, %rsp # imm = 0x1B8
-; SSE-NEXT:    movdqa 304(%rdi), %xmm3
+; SSE-NEXT:    movdqa 240(%rdi), %xmm6
 ; SSE-NEXT:    movdqa 272(%rdi), %xmm5
 ; SSE-NEXT:    movdqa 224(%rdi), %xmm15
-; SSE-NEXT:    movdqa 240(%rdi), %xmm6
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm7
 ; SSE-NEXT:    movdqa (%rdi), %xmm2
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm8
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 160(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm9
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm14
-; SSE-NEXT:    movdqa 160(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm4
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm0
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 304(%rdi), %xmm3
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE-NEXT:    movdqa %xmm4, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -4210,30 +4210,30 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    subq $1160, %rsp # imm = 0x488
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm8
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa (%rdi), %xmm13
-; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 640(%rdi), %xmm3
+; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm6
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm5
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 640(%rdi), %xmm3
 ; SSE-NEXT:    movdqa 608(%rdi), %xmm4
-; SSE-NEXT:    movdqa 560(%rdi), %xmm10
 ; SSE-NEXT:    movdqa 576(%rdi), %xmm1
+; SSE-NEXT:    movdqa 560(%rdi), %xmm10
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 192(%rdi), %xmm7
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm9
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 128(%rdi), %xmm0
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 192(%rdi), %xmm7
+; SSE-NEXT:    movaps 128(%rdi), %xmm0
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
-; SSE-NEXT:    movdqa %xmm9, %xmm12
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
 ; SSE-NEXT:    movdqa %xmm7, %xmm14
+; SSE-NEXT:    movdqa (%rdi), %xmm13
+; SSE-NEXT:    movdqa %xmm9, %xmm12
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
@@ -4490,8 +4490,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; SSE-NEXT:    movdqa 736(%rdi), %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3]
+; SSE-NEXT:    movdqa 736(%rdi), %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
 ; SSE-NEXT:    movdqa %xmm2, %xmm12
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8551,18 +8551,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 640(%rdi), %xmm13
 ; SSE-NEXT:    movdqa 608(%rdi), %xmm6
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 560(%rdi), %xmm10
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 192(%rdi), %xmm2
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 576(%rdi), %xmm7
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 192(%rdi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 160(%rdi), %xmm15
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm1
+; SSE-NEXT:    movdqa 160(%rdi), %xmm15
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-NEXT:    movdqa 560(%rdi), %xmm10
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3]
 ; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -11204,13 +11204,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 1120(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm12
 ; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm6
-; AVX2-NEXT:    vmovdqa 672(%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm8
-; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 672(%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm3
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
+; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm6
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
 ; AVX2-NEXT:    vmovdqa %ymm3, %ymm11
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -12234,13 +12234,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 1120(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm12
 ; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqa 672(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 672(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
+; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
 ; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm11
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13262,22 +13262,22 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 1152(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 1120(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 672(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 672(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
+; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
 ; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm11
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm10
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index 2fd173c729170b..9448acd134008d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -2426,14 +2426,14 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 448(%rdi), %ymm7
 ; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
-; AVX-NEXT:    vmovaps 480(%rdi), %ymm9
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX-NEXT:    vmovaps 480(%rdi), %ymm9
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5]
 ; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
@@ -4401,9 +4401,8 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 608(%rdi), %xmm6
 ; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 576(%rdi), %xmm7
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 672(%rdi), %xmm8
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 640(%rdi), %xmm4
 ; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4413,19 +4412,20 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 160(%rdi), %xmm10
 ; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 128(%rdi), %xmm1
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 224(%rdi), %xmm2
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 192(%rdi), %xmm0
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 128(%rdi), %xmm1
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT:    movaps %xmm1, %xmm2
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
 ; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 576(%rdi), %xmm7
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
 ; SSE-NEXT:    movaps %xmm4, %xmm3
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
@@ -9221,9 +9221,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 352(%rdi), %xmm5
 ; SSE-NEXT:    movaps %xmm5, (%rsp) # 16-byte Spill
-; SSE-NEXT:    movaps 320(%rdi), %xmm6
-; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 416(%rdi), %xmm7
+; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 384(%rdi), %xmm8
 ; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9233,19 +9232,20 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 160(%rdi), %xmm10
 ; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 128(%rdi), %xmm1
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 224(%rdi), %xmm2
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 192(%rdi), %xmm0
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 128(%rdi), %xmm1
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT:    movaps %xmm1, %xmm2
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
 ; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 320(%rdi), %xmm6
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
 ; SSE-NEXT:    movaps %xmm8, %xmm3
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
@@ -15784,7 +15784,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm11
 ; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm18
-; AVX512-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512-NEXT:    vmovaps 1536(%rdi), %zmm0
 ; AVX512-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 1728(%rdi), %zmm24
@@ -15802,14 +15801,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 1344(%rdi), %zmm13
 ; AVX512-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm28
 ; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm4
@@ -15818,11 +15816,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-NEXT:    movb $-64, %al
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
@@ -15869,10 +15869,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 %zmm31, %zmm24
 ; AVX512-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm5
-; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm22
@@ -16381,7 +16381,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm11
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm18
-; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512-FCP-NEXT:    vmovaps 1536(%rdi), %zmm0
 ; AVX512-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm24
@@ -16399,14 +16398,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm13
 ; AVX512-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
 ; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
@@ -16415,11 +16413,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-FCP-NEXT:    movb $-64, %al
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
@@ -16466,10 +16466,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
 ; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
@@ -16978,7 +16978,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm11
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm18
-; AVX512DQ-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-NEXT:    vmovaps 1536(%rdi), %zmm0
 ; AVX512DQ-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 1728(%rdi), %zmm24
@@ -16996,14 +16995,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 1344(%rdi), %zmm13
 ; AVX512DQ-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512DQ-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm28
 ; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm4
@@ -17012,11 +17010,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512DQ-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-NEXT:    movb $-64, %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
@@ -17063,10 +17063,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm31, %zmm24
 ; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm5
-; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm22
@@ -17575,7 +17575,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovaps 1536(%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm24
@@ -17593,14 +17592,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
@@ -17609,11 +17607,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
@@ -17660,10 +17660,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
@@ -18172,7 +18172,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512BW-NEXT:    vmovaps 1536(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm24
@@ -18190,14 +18189,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm28
 ; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
@@ -18206,11 +18204,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
@@ -18257,10 +18257,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm24
 ; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm22
@@ -18769,7 +18769,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm11
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm18
-; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512BW-FCP-NEXT:    vmovaps 1536(%rdi), %zmm0
 ; AVX512BW-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm24
@@ -18787,14 +18786,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
 ; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
@@ -18803,11 +18801,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-FCP-NEXT:    movb $-64, %al
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
@@ -18854,10 +18854,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
@@ -19366,7 +19366,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm11
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm18
-; AVX512DQ-BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-BW-NEXT:    vmovaps 1536(%rdi), %zmm0
 ; AVX512DQ-BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1728(%rdi), %zmm24
@@ -19384,14 +19383,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1344(%rdi), %zmm13
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512DQ-BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm28
 ; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
@@ -19400,11 +19398,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-NEXT:    movb $-64, %al
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
@@ -19451,10 +19451,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm31, %zmm24
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm22
@@ -19963,7 +19963,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps 1536(%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1728(%rdi), %zmm24
@@ -19981,14 +19980,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1344(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
@@ -19997,11 +19995,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
@@ -20048,10 +20048,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm22
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
index 6716d97b3f07c0..9ed29fc54dbc1f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
@@ -1409,7 +1409,6 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 192(%rdi), %ymm5
 ; AVX2-NEXT:    vmovaps 160(%rdi), %xmm2
 ; AVX2-NEXT:    vinsertf128 $1, 224(%rdi), %ymm2, %ymm9
 ; AVX2-NEXT:    vmovaps 128(%rdi), %xmm2
@@ -1419,9 +1418,10 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps 416(%rdi), %xmm7
 ; AVX2-NEXT:    vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
 ; AVX2-NEXT:    vmovaps 384(%rdi), %xmm7
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
+; AVX2-NEXT:    vmovaps 192(%rdi), %ymm5
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps (%rdi), %xmm8
 ; AVX2-NEXT:    vmovaps 32(%rdi), %xmm13
 ; AVX2-NEXT:    vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
@@ -1527,7 +1527,6 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovaps 160(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vinsertf128 $1, 224(%rdi), %ymm2, %ymm9
 ; AVX2-FP-NEXT:    vmovaps 128(%rdi), %xmm2
@@ -1537,9 +1536,10 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps 416(%rdi), %xmm7
 ; AVX2-FP-NEXT:    vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
 ; AVX2-FP-NEXT:    vmovaps 384(%rdi), %xmm7
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
+; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm8
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm13
 ; AVX2-FP-NEXT:    vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
@@ -1645,7 +1645,6 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 224(%rdi), %ymm2, %ymm9
 ; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %xmm2
@@ -1655,9 +1654,10 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %xmm7
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
 ; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %xmm7
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
+; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm8
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm13
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index 70164cff890729..1ef07aabc54c91 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -798,18 +798,18 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-LABEL: load_i64_stride5_vf8:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovapd 128(%rdi), %ymm1
-; AVX-NEXT:    vmovapd 256(%rdi), %ymm0
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm2
 ; AVX-NEXT:    vmovapd 224(%rdi), %ymm9
-; AVX-NEXT:    vmovapd 96(%rdi), %ymm2
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm2[6,7]
 ; AVX-NEXT:    vmovapd 64(%rdi), %ymm7
-; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm7[0,1,2],ymm2[3]
-; AVX-NEXT:    vmovapd (%rdi), %xmm10
+; AVX-NEXT:    vmovaps (%rdi), %xmm10
+; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm8
-; AVX-NEXT:    vmovapd 32(%rdi), %xmm4
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm11
-; AVX-NEXT:    vblendpd {{.*#+}} xmm5 = xmm10[0],xmm4[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 256(%rdi), %ymm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm4[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3]
 ; AVX-NEXT:    vmovapd %ymm0, %ymm3
 ; AVX-NEXT:    vmovapd 192(%rdi), %xmm5
@@ -849,9 +849,9 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3]
-; AVX-NEXT:    vblendpd {{.*#+}} xmm2 = xmm4[0],xmm15[1]
-; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm15[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm12[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
@@ -865,7 +865,7 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd %ymm11, 32(%r8)
 ; AVX-NEXT:    vmovapd %ymm8, (%r8)
 ; AVX-NEXT:    vmovapd %ymm0, 32(%r9)
-; AVX-NEXT:    vmovapd %ymm1, (%r9)
+; AVX-NEXT:    vmovaps %ymm1, (%r9)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -10952,7 +10952,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 1792(%rdi), %zmm7
 ; AVX512-NEXT:    vmovdqa64 1408(%rdi), %zmm0
 ; AVX512-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm8
 ; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm4
@@ -10966,6 +10965,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
@@ -11420,7 +11420,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm4
@@ -11434,6 +11433,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
@@ -11888,7 +11888,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 1792(%rdi), %zmm7
 ; AVX512DQ-NEXT:    vmovdqa64 1408(%rdi), %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm8
 ; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm4
@@ -11902,6 +11901,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512DQ-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
@@ -12356,7 +12356,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm4
@@ -12370,6 +12369,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
@@ -12824,7 +12824,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm8
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
@@ -12838,6 +12837,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
@@ -13292,7 +13292,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm4
@@ -13306,6 +13305,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
@@ -13760,7 +13760,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1792(%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm8
 ; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
@@ -13774,6 +13773,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
@@ -14228,7 +14228,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm4
@@ -14242,6 +14241,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index 21e1b17760c246..3dbd078504caa3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -919,13 +919,13 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movaps 16(%rdi), %xmm7
 ; SSE-NEXT:    movaps 48(%rdi), %xmm1
 ; SSE-NEXT:    movaps 144(%rdi), %xmm2
-; SSE-NEXT:    movaps 96(%rdi), %xmm11
 ; SSE-NEXT:    movaps 240(%rdi), %xmm3
 ; SSE-NEXT:    movaps 192(%rdi), %xmm12
 ; SSE-NEXT:    movaps 336(%rdi), %xmm4
-; SSE-NEXT:    movaps 288(%rdi), %xmm9
 ; SSE-NEXT:    movaps %xmm9, %xmm14
+; SSE-NEXT:    movaps 288(%rdi), %xmm9
 ; SSE-NEXT:    movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0]
+; SSE-NEXT:    movaps 96(%rdi), %xmm11
 ; SSE-NEXT:    movaps %xmm14, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1]
 ; SSE-NEXT:    movaps %xmm12, %xmm4
@@ -2502,10 +2502,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-LABEL: load_i64_stride6_vf16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $488, %rsp # imm = 0x1E8
-; AVX2-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX2-NEXT:    vmovaps 512(%rdi), %ymm7
 ; AVX2-NEXT:    vmovaps 288(%rdi), %ymm12
 ; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 512(%rdi), %ymm7
 ; AVX2-NEXT:    vmovaps 480(%rdi), %ymm4
 ; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
@@ -2514,6 +2513,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps (%rdi), %xmm1
 ; AVX2-NEXT:    vmovaps 48(%rdi), %xmm5
 ; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
+; AVX2-NEXT:    vmovaps 320(%rdi), %ymm10
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-NEXT:    vmovaps %ymm0, %ymm15
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2716,10 +2716,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-LABEL: load_i64_stride6_vf16:
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $488, %rsp # imm = 0x1E8
-; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm12
 ; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
@@ -2728,6 +2727,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovaps 48(%rdi), %xmm5
 ; AVX2-FP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
+; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-FP-NEXT:    vmovaps %ymm0, %ymm15
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2930,10 +2930,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-LABEL: load_i64_stride6_vf16:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $488, %rsp # imm = 0x1E8
-; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm0
@@ -2942,6 +2941,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovaps 48(%rdi), %xmm5
 ; AVX2-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
+; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, %ymm15
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 1d1da0954d675f..16647d0da63c5b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -9706,11 +9706,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-LABEL: load_i64_stride7_vf32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    subq $2760, %rsp # imm = 0xAC8
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm30
 ; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm28
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm12
@@ -9723,7 +9726,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm6
@@ -9735,20 +9737,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
 ; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
 ; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
@@ -10180,11 +10179,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf32:
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    subq $2760, %rsp # imm = 0xAC8
-; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm30
 ; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
 ; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm12
@@ -10197,7 +10199,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm6
@@ -10209,20 +10210,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
 ; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
@@ -10654,11 +10652,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-LABEL: load_i64_stride7_vf32:
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    subq $2760, %rsp # imm = 0xAC8
-; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm20
-; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm30
 ; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm28
 ; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm12
@@ -10671,7 +10672,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm6
@@ -10683,20 +10683,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
 ; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512DQ-BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm3
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm3
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512DQ-BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm3
@@ -11128,11 +11125,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf32:
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    subq $2760, %rsp # imm = 0xAC8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm12
@@ -11145,7 +11145,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm6
@@ -11157,20 +11156,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
@@ -17266,25 +17262,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 3008(%rdi), %zmm19
 ; AVX512-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 2752(%rdi), %zmm18
 ; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 2688(%rdi), %zmm7
-; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 2432(%rdi), %zmm17
 ; AVX512-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 2368(%rdi), %zmm9
 ; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 1984(%rdi), %zmm11
 ; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 1536(%rdi), %zmm12
 ; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-NEXT:    vmovdqa64 1472(%rdi), %zmm13
 ; AVX512-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 1088(%rdi), %zmm14
@@ -17297,10 +17293,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm15
 ; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -17308,6 +17303,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512-NEXT:    vmovdqa 464(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -18265,25 +18261,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm19
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512-FCP-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm18
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm17
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm9
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm11
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm13
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm14
@@ -18296,10 +18292,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm15
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -18307,6 +18302,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -19264,25 +19260,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 3008(%rdi), %zmm19
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQ-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 2752(%rdi), %zmm18
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 2688(%rdi), %zmm7
-; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 2432(%rdi), %zmm17
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 2368(%rdi), %zmm9
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 1984(%rdi), %zmm11
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 1536(%rdi), %zmm12
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 1472(%rdi), %zmm13
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 1088(%rdi), %zmm14
@@ -19295,10 +19291,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm15
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -19306,6 +19301,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512DQ-NEXT:    vmovdqa 464(%rdi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -20263,25 +20259,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm19
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm18
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm17
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm14
@@ -20294,10 +20290,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm15
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -20305,6 +20300,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -21261,15 +21257,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 3008(%rdi), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %zmm30
+; AVX512BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -21283,19 +21280,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm16
 ; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
 ; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
@@ -22243,15 +22239,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm25
-; AVX512BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm30
+; AVX512BW-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -22265,19 +22262,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm11
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm17
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
@@ -23225,15 +23221,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 3264(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 3008(%rdi), %zmm25
-; AVX512DQ-BW-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2752(%rdi), %zmm30
+; AVX512DQ-BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2688(%rdi), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2432(%rdi), %zmm19
+; AVX512DQ-BW-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2368(%rdi), %zmm8
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -23247,19 +23244,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm11
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm16
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm17
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm15
+; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
@@ -24207,15 +24203,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3264(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -24229,19 +24226,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index ceb4948726760b..80f628099ee897 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -4046,13 +4046,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm31
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4077,15 +4075,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4099,19 +4098,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
 ; AVX512-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -4269,13 +4269,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4300,15 +4298,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4322,19 +4321,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
 ; AVX512-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -4492,13 +4492,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512DQ-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm31
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4523,15 +4521,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4545,19 +4544,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
 ; AVX512DQ-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -4715,13 +4715,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4746,15 +4744,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4768,19 +4767,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
 ; AVX512DQ-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -4938,13 +4938,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512BW-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4969,15 +4967,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4991,19 +4990,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
 ; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -5161,13 +5161,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512BW-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -5192,15 +5190,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -5214,19 +5213,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
 ; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -5384,13 +5384,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512DQ-BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-BW-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm31
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -5415,15 +5413,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -5437,19 +5436,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
 ; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -5607,13 +5607,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -5638,15 +5636,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -5660,19 +5659,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -8744,24 +8744,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm11
-; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    movb $-64, %al
+; AVX512-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm8
 ; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm23
-; AVX512-NEXT:    movb $-64, %al
-; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm2
 ; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -8804,6 +8804,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512-NEXT:    vmovdqa64 1792(%rdi), %zmm26
 ; AVX512-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -8824,7 +8825,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
 ; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
-; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
 ; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -8852,9 +8852,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
-; AVX512-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX512-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm29, %zmm15
@@ -8867,7 +8868,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
 ; AVX512-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -8986,7 +8986,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -8998,6 +8997,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -9265,24 +9265,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    movb $-64, %al
+; AVX512-FCP-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
-; AVX512-FCP-NEXT:    movb $-64, %al
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -9325,6 +9325,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm26
 ; AVX512-FCP-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -9345,7 +9346,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
-; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -9373,9 +9373,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
-; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm15
@@ -9388,7 +9389,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -9507,7 +9507,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -9519,6 +9518,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -9786,24 +9786,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm11
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    movb $-64, %al
+; AVX512DQ-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm8
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm23
-; AVX512DQ-NEXT:    movb $-64, %al
-; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512DQ-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -9846,6 +9846,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512DQ-NEXT:    vmovdqa64 1792(%rdi), %zmm26
 ; AVX512DQ-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -9866,7 +9867,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
-; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -9894,9 +9894,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512DQ-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
-; AVX512DQ-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512DQ-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm29, %zmm15
@@ -9909,7 +9910,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -10028,7 +10028,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512DQ-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -10040,6 +10039,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -10307,24 +10307,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    movb $-64, %al
+; AVX512DQ-FCP-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
-; AVX512DQ-FCP-NEXT:    movb $-64, %al
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -10367,6 +10367,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm26
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -10387,7 +10388,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
-; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10415,9 +10415,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm15
@@ -10430,7 +10431,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -10549,7 +10549,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -10561,6 +10560,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -10828,24 +10828,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    movb $-64, %al
+; AVX512BW-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm23
-; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -10888,6 +10888,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm26
 ; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -10908,7 +10909,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10936,9 +10936,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm15
@@ -10951,7 +10952,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -11070,7 +11070,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -11082,6 +11081,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -11349,24 +11349,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    movb $-64, %al
+; AVX512BW-FCP-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
-; AVX512BW-FCP-NEXT:    movb $-64, %al
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -11409,6 +11409,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm26
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -11429,7 +11430,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
-; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -11457,9 +11457,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm15
@@ -11472,7 +11473,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -11591,7 +11591,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -11603,6 +11602,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -11870,24 +11870,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    movb $-64, %al
+; AVX512DQ-BW-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm8
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm23
-; AVX512DQ-BW-NEXT:    movb $-64, %al
-; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -11930,6 +11930,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1792(%rdi), %zmm26
 ; AVX512DQ-BW-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -11950,7 +11951,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
-; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -11978,9 +11978,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm29, %zmm15
@@ -11993,7 +11994,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -12112,7 +12112,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -12124,6 +12123,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -12391,24 +12391,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
-; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -12451,6 +12451,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm26
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -12471,7 +12472,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -12499,9 +12499,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm15
@@ -12514,7 +12515,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -12633,7 +12633,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -12645,6 +12644,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -18960,32 +18960,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm5
-; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    movb $-64, %al
+; AVX512-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    movb $-64, %al
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -19025,10 +19025,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm13
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; AVX512-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512-NEXT:    vmovdqa64 1792(%rdi), %zmm1
@@ -20031,32 +20031,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    movb $-64, %al
+; AVX512-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    movb $-64, %al
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -20096,10 +20096,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm13
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm1
@@ -21102,32 +21102,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm5
-; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    movb $-64, %al
+; AVX512DQ-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    movb $-64, %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -21167,10 +21167,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm13
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; AVX512DQ-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 1792(%rdi), %zmm1
@@ -22173,32 +22173,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    movb $-64, %al
+; AVX512DQ-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -22238,10 +22238,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm13
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm1
@@ -23244,32 +23244,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    movb $-64, %al
+; AVX512BW-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512BW-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -23309,10 +23309,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm13
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
@@ -24315,32 +24315,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    movb $-64, %al
+; AVX512BW-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    movb $-64, %al
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512BW-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -24380,10 +24380,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm13
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm1
@@ -25386,32 +25386,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    movb $-64, %al
+; AVX512DQ-BW-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    movb $-64, %al
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-BW-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -25451,10 +25451,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm13
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
@@ -26457,32 +26457,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -26522,10 +26522,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm13
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1792(%rdi), %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index faecad65c395b2..429758c8350659 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -1037,9 +1037,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa (%rdi), %xmm6
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm8
-; SSE-NEXT:    movdqa 48(%rdi), %xmm12
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
+; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm13, %xmm7
 ; SSE-NEXT:    pandn %xmm4, %xmm7
 ; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
@@ -1053,8 +1052,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    por %xmm7, %xmm0
 ; SSE-NEXT:    pxor %xmm9, %xmm9
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
 ; SSE-NEXT:    movdqa %xmm6, %xmm3
 ; SSE-NEXT:    pandn %xmm1, %xmm3
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
@@ -1075,6 +1074,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535]
 ; SSE-NEXT:    movdqa %xmm15, %xmm1
 ; SSE-NEXT:    pandn %xmm3, %xmm1
+; SSE-NEXT:    movdqa 48(%rdi), %xmm12
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
 ; SSE-NEXT:    movdqa %xmm8, %xmm10
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 15f6ef4006fddf..60fcf25b507b7b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -1125,9 +1125,8 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm8, %xmm0
 ; SSE-NEXT:    movdqa %xmm2, %xmm6
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm15
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm14, %xmm0
 ; SSE-NEXT:    pand %xmm8, %xmm0
@@ -1135,6 +1134,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm8, %xmm7
 ; SSE-NEXT:    packuswb %xmm0, %xmm7
 ; SSE-NEXT:    packuswb %xmm1, %xmm7
+; SSE-NEXT:    movdqa %xmm2, %xmm15
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
 ; SSE-NEXT:    pand %xmm8, %xmm0
 ; SSE-NEXT:    movdqa %xmm11, %xmm1
@@ -1959,9 +1959,8 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-LABEL: load_i8_stride4_vf64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $600, %rsp # imm = 0x258
-; SSE-NEXT:    movdqa 16(%rdi), %xmm8
-; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm15
+; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm14
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm7
@@ -1978,8 +1977,8 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm2, %xmm5
 ; SSE-NEXT:    pand %xmm9, %xmm1
-; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm13, %xmm0
+; SSE-NEXT:    packuswb %xmm13, %xmm1
 ; SSE-NEXT:    pand %xmm9, %xmm0
 ; SSE-NEXT:    movdqa %xmm6, %xmm2
 ; SSE-NEXT:    pand %xmm9, %xmm2
@@ -1989,9 +1988,10 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
 ; SSE-NEXT:    pand %xmm9, %xmm0
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
-; SSE-NEXT:    pand %xmm9, %xmm1
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
+; SSE-NEXT:    pand %xmm9, %xmm1
 ; SSE-NEXT:    movdqa %xmm7, %xmm0
+; SSE-NEXT:    movdqa 16(%rdi), %xmm8
 ; SSE-NEXT:    pand %xmm9, %xmm0
 ; SSE-NEXT:    movdqa %xmm4, %xmm2
 ; SSE-NEXT:    pand %xmm9, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index 43a45b9fd59a75..c215e2dd9f4d9d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -2125,10 +2125,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 48(%rdi), %xmm2
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm4, %xmm0
+; SSE-NEXT:    movdqa 48(%rdi), %xmm2
 ; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
@@ -4083,12 +4083,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm9
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 208(%rdi), %xmm4
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    movdqa 208(%rdi), %xmm4
 ; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm4, %xmm1
 ; SSE-NEXT:    pand %xmm2, %xmm1
@@ -4790,10 +4790,10 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm5, %xmm2
 ; SSE-NEXT:    pand %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pand %xmm8, %xmm0
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT:    pandn %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pand %xmm8, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm8, %xmm15
 ; SSE-NEXT:    pand %xmm8, %xmm13
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index e4dc257543d20c..c7b73198c7f4d7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -1437,6 +1437,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm0, %xmm5
 ; SSE-NEXT:    por %xmm11, %xmm5
 ; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm5[0,2,1,3]
+; SSE-NEXT:    pandn %xmm3, %xmm12
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
 ; SSE-NEXT:    pand %xmm1, %xmm11
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,6,7]
@@ -1446,7 +1447,6 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    packuswb %xmm0, %xmm0
 ; SSE-NEXT:    pand %xmm8, %xmm0
 ; SSE-NEXT:    por %xmm9, %xmm0
-; SSE-NEXT:    pandn %xmm3, %xmm12
 ; SSE-NEXT:    por %xmm12, %xmm10
 ; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[3,1,2,0]
 ; SSE-NEXT:    pand %xmm1, %xmm9
@@ -2586,8 +2586,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535]
 ; SSE-NEXT:    movdqa %xmm10, %xmm0
 ; SSE-NEXT:    pandn %xmm1, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT:    movdqa %xmm10, %xmm4
 ; SSE-NEXT:    movdqa %xmm11, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
 ; SSE-NEXT:    pandn %xmm5, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
@@ -2673,7 +2674,6 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm5
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm10, %xmm5
-; SSE-NEXT:    movdqa %xmm10, %xmm4
 ; SSE-NEXT:    pandn %xmm14, %xmm4
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm10, %xmm12
@@ -4661,11 +4661,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm5
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa (%rdi), %xmm7
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 32(%rdi), %xmm2
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm6
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 32(%rdi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535]
@@ -4674,6 +4673,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0]
 ; SSE-NEXT:    movdqa %xmm3, %xmm2
 ; SSE-NEXT:    pandn %xmm0, %xmm2
+; SSE-NEXT:    movdqa (%rdi), %xmm7
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm13, %xmm2
 ; SSE-NEXT:    pandn %xmm0, %xmm2
@@ -4881,6 +4881,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, %xmm4
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm6
+; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm7, %xmm2
 ; SSE-NEXT:    movdqa %xmm7, %xmm8
 ; SSE-NEXT:    pandn %xmm6, %xmm8
@@ -4891,7 +4892,6 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; SSE-NEXT:    pandn %xmm13, %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm3
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
@@ -4926,6 +4926,8 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
 ; SSE-NEXT:    pandn %xmm6, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pand %xmm0, %xmm3
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 96(%rdi), %xmm4
 ; SSE-NEXT:    movdqa %xmm4, %xmm2
 ; SSE-NEXT:    pand %xmm0, %xmm2
@@ -4945,8 +4947,6 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm0, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pand %xmm0, %xmm3
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm0, %xmm2
@@ -6046,8 +6046,8 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb %xmm2, %xmm7, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm2, %xmm8
-; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX-NEXT:    vmovdqa %xmm14, %xmm13
+; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX-NEXT:    vpshufb %xmm3, %xmm14, %xmm2
 ; AVX-NEXT:    vmovdqa %xmm3, %xmm14
 ; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
@@ -6435,9 +6435,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm7
 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX-NEXT:    vandps %ymm6, %ymm2, %ymm2
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
 ; AVX-NEXT:    vandnps %ymm5, %ymm6, %ymm5
 ; AVX-NEXT:    vmovaps %ymm6, %ymm0
 ; AVX-NEXT:    vorps %ymm5, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 130ae31b37bfe9..48ef75742eccb1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -1087,7 +1087,6 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm4, %xmm2
 ; SSE-NEXT:    movdqa %xmm4, %xmm3
 ; SSE-NEXT:    pand %xmm0, %xmm3
-; SSE-NEXT:    movdqa %xmm10, %xmm11
 ; SSE-NEXT:    pandn %xmm10, %xmm0
 ; SSE-NEXT:    por %xmm3, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm3
@@ -1097,6 +1096,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    movdqa %xmm10, %xmm11
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5]
@@ -1863,7 +1863,6 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-LABEL: load_i8_stride7_vf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $168, %rsp
-; SSE-NEXT:    movdqa 96(%rdi), %xmm15
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 64(%rdi), %xmm7
 ; SSE-NEXT:    movdqa (%rdi), %xmm6
@@ -1898,6 +1897,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm10, %xmm3
 ; SSE-NEXT:    por %xmm0, %xmm3
+; SSE-NEXT:    movdqa 96(%rdi), %xmm15
 ; SSE-NEXT:    movdqa %xmm3, %xmm0
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535]
@@ -3619,17 +3619,16 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $648, %rsp # imm = 0x288
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm14
-; SSE-NEXT:    movdqa 192(%rdi), %xmm5
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 176(%rdi), %xmm6
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm4
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 176(%rdi), %xmm6
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm3
-; SSE-NEXT:    movdqa 160(%rdi), %xmm7
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    movdqa 160(%rdi), %xmm7
 ; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm7, %xmm1
 ; SSE-NEXT:    pand %xmm2, %xmm1
@@ -3647,6 +3646,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT:    packuswb %xmm0, %xmm2
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
+; SSE-NEXT:    movdqa 192(%rdi), %xmm5
 ; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm7, %xmm1
 ; SSE-NEXT:    pandn %xmm3, %xmm1
@@ -3835,10 +3835,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm5, %xmm1
 ; SSE-NEXT:    movdqa %xmm5, %xmm7
 ; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm13, %xmm3
 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm4, %xmm2
 ; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT:    movdqa %xmm13, %xmm3
 ; SSE-NEXT:    pand %xmm4, %xmm3
 ; SSE-NEXT:    movdqa %xmm4, %xmm13
 ; SSE-NEXT:    por %xmm2, %xmm3
@@ -3942,11 +3942,11 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    pandn %xmm5, %xmm0
-; SSE-NEXT:    movdqa %xmm12, %xmm7
 ; SSE-NEXT:    movdqa %xmm12, %xmm5
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pandn %xmm1, %xmm5
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3]
+; SSE-NEXT:    movdqa %xmm12, %xmm7
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    por %xmm0, %xmm1
@@ -4074,10 +4074,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    por %xmm1, %xmm10
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm3, %xmm0
 ; SSE-NEXT:    pandn %xmm1, %xmm0
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
 ; SSE-NEXT:    pand %xmm3, %xmm10
 ; SSE-NEXT:    por %xmm0, %xmm10
 ; SSE-NEXT:    packuswb %xmm5, %xmm0
@@ -4118,10 +4118,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-NEXT:    packuswb %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
 ; SSE-NEXT:    movdqa %xmm6, %xmm4
 ; SSE-NEXT:    pandn %xmm3, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
@@ -7220,29 +7220,29 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm8
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 112(%rdi), %xmm4
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm3
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 160(%rdi), %xmm6
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    movdqa 160(%rdi), %xmm6
 ; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm6, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm6
 ; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm2, %xmm7
 ; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pxor %xmm6, %xmm6
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT:    movdqa 112(%rdi), %xmm4
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT:    packuswb %xmm0, %xmm2
@@ -7855,13 +7855,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm4, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm15, %xmm0
 ; SSE-NEXT:    pandn %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm12, %xmm2
-; SSE-NEXT:    movdqa %xmm12, %xmm1
 ; SSE-NEXT:    movdqa %xmm9, %xmm4
+; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm12, %xmm1
+; SSE-NEXT:    movdqa %xmm12, %xmm2
 ; SSE-NEXT:    pandn %xmm9, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm15, %xmm4
 ; SSE-NEXT:    por %xmm0, %xmm4
@@ -9479,12 +9479,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128]
 ; AVX-NEXT:    vpshufb %xmm6, %xmm7, %xmm14
 ; AVX-NEXT:    vpor %xmm5, %xmm14, %xmm5
-; AVX-NEXT:    vmovdqa %xmm12, %xmm14
 ; AVX-NEXT:    vpblendvb %xmm12, %xmm2, %xmm5, %xmm2
-; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpshufb %xmm9, %xmm13, %xmm5
 ; AVX-NEXT:    vmovdqa %xmm13, %xmm9
 ; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm2
+; AVX-NEXT:    vmovdqa %xmm12, %xmm14
+; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpshufb %xmm13, %xmm13, %xmm5
 ; AVX-NEXT:    vmovdqa %xmm1, %xmm12
 ; AVX-NEXT:    vpor %xmm5, %xmm2, %xmm1
 ; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm2
@@ -9936,8 +9936,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    # xmm9 = mem[0,0]
 ; AVX-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
 ; AVX-NEXT:    vpor %xmm7, %xmm10, %xmm7
-; AVX-NEXT:    vmovdqa %xmm0, %xmm11
 ; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm10
+; AVX-NEXT:    vmovdqa %xmm0, %xmm11
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm7, %ymm7
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX-NEXT:    vandps %ymm0, %ymm3, %ymm3
@@ -10148,22 +10148,22 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $760, %rsp # imm = 0x2F8
 ; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm6
-; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
-; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm8
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm3, %ymm13
 ; AVX2-NEXT:    vmovdqa %ymm2, %ymm10
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa %ymm1, %ymm12
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
@@ -10687,22 +10687,22 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $760, %rsp # imm = 0x2F8
 ; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
 ; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm13
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm10
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa %ymm1, %ymm12
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vpor %xmm3, %xmm0, %xmm0
@@ -11225,10 +11225,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $776, %rsp # imm = 0x308
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
@@ -11241,6 +11240,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vpor %xmm3, %xmm0, %xmm0
@@ -13521,7 +13521,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512BW-NEXT:    vporq %xmm22, %xmm20, %xmm20
 ; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm22 {%k4}
 ; AVX512BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm23
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -13574,6 +13573,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm2 {%k3}
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -13894,7 +13894,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm20, %xmm20
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm22
-; AVX512BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
 ; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm23
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -13914,6 +13913,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm20, %ymm20
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
+; AVX512BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm20, %xmm15
@@ -14138,13 +14138,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
 ; AVX512BW-FCP-NEXT:    vpermw %zmm26, %zmm0, %zmm0
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm7
-; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k1}
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX512BW-FCP-NEXT:    movl $4186112, %edi # imm = 0x3FE000
+; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
@@ -14254,7 +14254,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512DQ-BW-NEXT:    vporq %xmm22, %xmm20, %xmm20
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512DQ-BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm22 {%k4}
 ; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm23
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -14307,6 +14306,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm2 {%k3}
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -14624,7 +14624,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm20, %xmm20
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm22
-; AVX512DQ-BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
 ; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm23
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -14644,6 +14643,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm20, %ymm20
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm20, %xmm15
@@ -14868,13 +14868,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
 ; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm26, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    movl $4186112, %edi # imm = 0x3FE000
+; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index b1eb4d6af4eb70..72be7f0399fd52 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -3792,13 +3792,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-LABEL: load_i8_stride8_vf32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $904, %rsp # imm = 0x388
-; SSE-NEXT:    movdqa 64(%rdi), %xmm5
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 128(%rdi), %xmm6
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm10
 ; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 96(%rdi), %xmm12
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 128(%rdi), %xmm6
+; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm13
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm14
@@ -3812,8 +3810,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    movdqa %xmm8, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
-; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm7, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm2
@@ -3824,16 +3822,18 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    movdqa %xmm11, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    movdqa 96(%rdi), %xmm12
+; SSE-NEXT:    movdqa 64(%rdi), %xmm5
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm13, %xmm0
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    movdqa %xmm6, %xmm3
 ; SSE-NEXT:    pand %xmm4, %xmm3
 ; SSE-NEXT:    packuswb %xmm0, %xmm3
-; SSE-NEXT:    movdqa 112(%rdi), %xmm15
-; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    packuswb %xmm3, %xmm3
+; SSE-NEXT:    movdqa 112(%rdi), %xmm15
 ; SSE-NEXT:    packuswb %xmm0, %xmm3
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8619,15 +8619,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm13
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 208(%rdi), %xmm2
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 224(%rdi), %xmm9
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 240(%rdi), %xmm12
 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0]
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    movdqa %xmm9, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    movdqa 208(%rdi), %xmm2
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
@@ -8647,10 +8647,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm5, %xmm3
 ; SSE-NEXT:    pand %xmm4, %xmm3
 ; SSE-NEXT:    packuswb %xmm0, %xmm3
-; SSE-NEXT:    movdqa 112(%rdi), %xmm14
-; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    packuswb %xmm3, %xmm3
+; SSE-NEXT:    movdqa 112(%rdi), %xmm14
 ; SSE-NEXT:    packuswb %xmm0, %xmm3
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -13908,7 +13908,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
 ; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm24
 ; AVX512-NEXT:    vpshufb %xmm9, %xmm8, %xmm2
 ; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm21
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -13921,6 +13920,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vmovdqa 416(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa %xmm7, %xmm13
 ; AVX512-NEXT:    vmovdqa 432(%rdi), %xmm11
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
@@ -14004,8 +14004,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm13
-; AVX512-NEXT:    vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpshufb %xmm4, %xmm13, %xmm1
 ; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm10
 ; AVX512-NEXT:    vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -14172,6 +14171,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm20
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -14245,7 +14245,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm14, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm20
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
@@ -14312,6 +14311,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm11
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
@@ -14324,7 +14324,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
 ; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm30
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm9
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
 ; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm13
@@ -14838,7 +14837,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm16
 ; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -14856,8 +14854,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm28
 ; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa %ymm3, %ymm4
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
@@ -14868,9 +14866,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm10
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm16
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm10
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm20
@@ -15224,7 +15223,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
 ; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm7, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm24
 ; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm8, %xmm2
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm21
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -15237,6 +15235,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vmovdqa 416(%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm13
 ; AVX512DQ-NEXT:    vmovdqa 432(%rdi), %xmm11
 ; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
@@ -15320,8 +15319,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm13
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm13, %xmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm10
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -15488,6 +15486,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm20
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -15561,7 +15560,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm14, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm20
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
@@ -15628,6 +15626,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm11
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
@@ -15640,7 +15639,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm30
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm9
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm13
@@ -16154,7 +16152,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm16
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -16172,8 +16169,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm28
 ; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
@@ -16184,9 +16181,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm10
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm16
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm10
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm20
@@ -16446,7 +16444,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm5
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
 ; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
 ; AVX512BW-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512BW-NEXT:    vmovdqa 496(%rdi), %xmm4
@@ -16462,22 +16459,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-NEXT:    vpshufb %xmm19, %xmm6, %xmm4
 ; AVX512BW-NEXT:    vmovdqa64 %xmm6, %xmm26
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-NEXT:    vmovdqa 448(%rdi), %xmm7
 ; AVX512BW-NEXT:    vpshufb %xmm19, %xmm7, %xmm6
-; AVX512BW-NEXT:    vmovdqa64 %xmm7, %xmm30
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT:    vmovdqa64 %xmm7, %xmm30
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX512BW-NEXT:    vpmovqb %ymm4, %xmm4
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-NEXT:    vmovdqa 368(%rdi), %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vmovdqa64 %xmm2, %xmm31
-; AVX512BW-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovaps 368(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 352(%rdi), %xmm27
 ; AVX512BW-NEXT:    vpshufb %xmm12, %xmm27, %xmm6
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
@@ -16852,6 +16850,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX512BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm20
 ; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm13
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
@@ -16879,7 +16878,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm3
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm20
 ; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpsrlq $32, %zmm17, %zmm3
 ; AVX512BW-NEXT:    vpmovqb %zmm3, %xmm3
@@ -17326,19 +17324,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
+; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm28, %xmm15
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm11
-; AVX512BW-FCP-NEXT:    vmovdqa64 %xmm1, %xmm20
-; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
 ; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; AVX512BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm28, %xmm15
 ; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %xmm1, %xmm20
 ; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm18, %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm25
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm11
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
@@ -17573,7 +17571,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm5
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa 496(%rdi), %xmm4
@@ -17589,22 +17586,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm6, %xmm4
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm6, %xmm26
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-BW-NEXT:    vmovdqa 448(%rdi), %xmm7
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm7, %xmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm7, %xmm30
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm7, %xmm30
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-BW-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX512DQ-BW-NEXT:    vpmovqb %ymm4, %xmm4
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa 368(%rdi), %xmm2
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm2, %xmm31
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512DQ-BW-NEXT:    vmovaps 368(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 352(%rdi), %xmm27
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm27, %xmm6
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
@@ -17979,6 +17977,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm13
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
@@ -18006,7 +18005,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm29, %xmm3
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vpsrlq $32, %zmm17, %zmm3
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm3, %xmm3
@@ -18453,19 +18451,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm28, %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %xmm1, %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm28, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %xmm1, %xmm20
 ; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm18, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 9d1939f66219f9..9fd7862fdc368f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -1168,7 +1168,6 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa 16(%rsi), %xmm7
 ; SSE-NEXT:    movdqa 32(%rsi), %xmm8
 ; SSE-NEXT:    movdqa 48(%rsi), %xmm11
-; SSE-NEXT:    movdqa 32(%rdx), %xmm10
 ; SSE-NEXT:    movdqa 48(%rdx), %xmm12
 ; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2]
@@ -1184,10 +1183,11 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
 ; SSE-NEXT:    pand %xmm2, %xmm3
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2]
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    movdqa 32(%rdx), %xmm10
 ; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    por %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm5, %xmm3
 ; SSE-NEXT:    pandn %xmm1, %xmm3
@@ -2809,7 +2809,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP-LABEL: store_i16_stride3_vf64:
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm2
-; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm3
 ; AVX2-FP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
@@ -2830,6 +2829,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP-NEXT:    vmovdqa 80(%rsi), %xmm5
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm0
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
@@ -2953,7 +2953,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-LABEL: store_i16_stride3_vf64:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm3
 ; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
@@ -2974,6 +2973,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovdqa 80(%rsi), %xmm5
 ; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
 ; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
@@ -3107,16 +3107,13 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm3
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512-NEXT:    vmovdqa64 16(%rsi), %xmm20
-; AVX512-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
@@ -3139,8 +3136,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
 ; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm12
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512-NEXT:    vmovdqa 80(%rsi), %xmm13
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
@@ -3155,6 +3152,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm0
+; AVX512-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512-NEXT:    vmovdqa %ymm6, %ymm2
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm5
@@ -3165,10 +3163,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa 112(%rsi), %xmm12
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512-NEXT:    vmovdqa64 16(%rsi), %xmm20
 ; AVX512-NEXT:    vprold $16, %xmm12, %xmm12
+; AVX512-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm5
@@ -3259,16 +3259,13 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm20
-; AVX512-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512-FCP-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
@@ -3291,8 +3288,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rsi), %xmm13
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
@@ -3307,6 +3304,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm2
 ; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm5
@@ -3317,10 +3315,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa 112(%rsi), %xmm12
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm20
 ; AVX512-FCP-NEXT:    vprold $16, %xmm12, %xmm12
+; AVX512-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm5
@@ -3411,16 +3411,13 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm3
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 16(%rsi), %xmm20
-; AVX512DQ-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512DQ-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
@@ -3443,8 +3440,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm12
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512DQ-NEXT:    vmovdqa 80(%rsi), %xmm13
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
@@ -3459,6 +3456,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512DQ-NEXT:    vmovdqa %ymm6, %ymm2
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm5
@@ -3469,10 +3467,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa 112(%rsi), %xmm12
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vmovdqa64 16(%rsi), %xmm20
 ; AVX512DQ-NEXT:    vprold $16, %xmm12, %xmm12
+; AVX512DQ-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm5
@@ -3563,16 +3563,13 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512DQ-FCP-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
@@ -3595,8 +3592,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rsi), %xmm13
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
@@ -3611,6 +3608,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm5
@@ -3621,10 +3619,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rsi), %xmm12
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm20
 ; AVX512DQ-FCP-NEXT:    vprold $16, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm5
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index 704c92924abfbf..a52d6cc9bd3b71 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -3074,20 +3074,18 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride4_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512-NEXT:    vmovdqa 16(%rsi), %xmm0
 ; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm11
 ; AVX512-NEXT:    vmovdqa 48(%rsi), %xmm6
-; AVX512-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm12
 ; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm7
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm18
@@ -3163,11 +3161,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512-NEXT:    vinserti32x4 $1, %xmm2, %ymm17, %ymm2
+; AVX512-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm17
+; AVX512-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512-NEXT:    vmovdqa 80(%rcx), %xmm0
 ; AVX512-NEXT:    vmovdqa 80(%rdx), %xmm1
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3228,20 +3228,18 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-FCP-LABEL: store_i16_stride4_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512-FCP-NEXT:    vmovdqa 16(%rsi), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm11
 ; AVX512-FCP-NEXT:    vmovdqa 48(%rsi), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
 ; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm7
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm18
@@ -3317,11 +3315,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm2, %ymm17, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rcx), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rdx), %xmm1
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3382,20 +3382,18 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512DQ-LABEL: store_i16_stride4_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-NEXT:    vmovdqa 16(%rsi), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm11
 ; AVX512DQ-NEXT:    vmovdqa 48(%rsi), %xmm6
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm12
 ; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm7
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm18
@@ -3471,11 +3469,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm2, %ymm17, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm17
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-NEXT:    vmovdqa 80(%rcx), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdx), %xmm1
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3536,20 +3536,18 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride4_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rsi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rsi), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm7
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm18
@@ -3625,11 +3623,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm2, %ymm17, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rcx), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdx), %xmm1
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 7d2f52d3c5830e..994c785126d256 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -1157,12 +1157,11 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
 ; SSE-LABEL: store_i16_stride5_vf16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm15
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm5
 ; SSE-NEXT:    movdqa (%rsi), %xmm8
+; SSE-NEXT:    movdqa (%rcx), %xmm14
 ; SSE-NEXT:    movdqa 16(%rsi), %xmm0
 ; SSE-NEXT:    movdqa 16(%rdx), %xmm10
-; SSE-NEXT:    movdqa (%rcx), %xmm14
 ; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%rcx), %xmm11
 ; SSE-NEXT:    movdqa 16(%r8), %xmm3
@@ -1179,6 +1178,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0]
 ; SSE-NEXT:    pand %xmm9, %xmm4
+; SSE-NEXT:    movdqa (%rdi), %xmm15
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    movdqa %xmm9, %xmm13
@@ -2489,10 +2489,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5,6,6]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm6, %xmm11
-; SSE-NEXT:    movdqa %xmm1, %xmm12
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7]
+; SSE-NEXT:    movdqa %xmm1, %xmm12
 ; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
@@ -2836,12 +2836,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-LABEL: store_i16_stride5_vf32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $72, %rsp
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa (%rdx), %xmm6
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm3
 ; AVX2-NEXT:    vmovdqa (%r8), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%rdx), %xmm6
 ; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm8
 ; AVX2-NEXT:    vmovdqa (%rcx), %xmm7
 ; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm9
@@ -2852,15 +2851,19 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vmovdqa (%rsi), %xmm12
 ; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm10
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm11
+; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
+; AVX2-NEXT:    vpblendvb %ymm14, %ymm5, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm13
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm14, %ymm11, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX2-NEXT:    vpbroadcastq (%r8), %ymm11
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
 ; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm11, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
@@ -2870,15 +2873,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
-; AVX2-NEXT:    vpblendvb %ymm14, %ymm5, %ymm0, %ymm0
 ; AVX2-NEXT:    vpbroadcastq 32(%r8), %ymm5
 ; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm5, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX2-NEXT:    vpshufb %xmm11, %xmm12, %xmm0
 ; AVX2-NEXT:    vpbroadcastq 8(%rdi), %xmm12
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
 ; AVX2-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2]
@@ -2894,7 +2894,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15]
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm5
 ; AVX2-NEXT:    vpshufb %xmm11, %xmm10, %xmm10
 ; AVX2-NEXT:    vpbroadcastq 40(%rdi), %xmm11
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7]
@@ -2907,6 +2906,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1]
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm5
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm13, %ymm14, %ymm9, %ymm9
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm10[3,2,3,3,7,6,7,7]
@@ -3566,7 +3566,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpbroadcastq 8(%rdi), %xmm3
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm22
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
 ; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
@@ -3623,6 +3622,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm5
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm22
 ; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15]
@@ -3880,7 +3880,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpbroadcastq 8(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm22
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
@@ -3937,6 +3936,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm5
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm22
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15]
@@ -4583,13 +4583,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pandn %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm12, %xmm11
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
-; SSE-NEXT:    movdqa %xmm14, %xmm12
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
 ; SSE-NEXT:    pand %xmm9, %xmm0
-; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    pand %xmm3, %xmm0
+; SSE-NEXT:    movdqa %xmm14, %xmm12
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm3, %xmm7
 ; SSE-NEXT:    movdqa %xmm3, %xmm14
@@ -4978,10 +4978,10 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm7, %xmm4
-; SSE-NEXT:    movdqa %xmm0, %xmm13
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
+; SSE-NEXT:    movdqa %xmm0, %xmm13
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index c725dcd972cd59..b2a270adbb359e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -1711,19 +1711,19 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP-LABEL: store_i16_stride6_vf16:
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $24, %rsp
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovaps (%r9), %ymm3
 ; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm10
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm2
 ; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm8
-; AVX2-FP-NEXT:    vmovaps (%r9), %ymm3
 ; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm7
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm7
 ; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm9
 ; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm11
 ; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm5
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm3
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1]
@@ -2037,15 +2037,16 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm4
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %ymm16
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm10
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
+; AVX512-FCP-NEXT:    vmovaps {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3]
+; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %ymm16
 ; AVX512-FCP-NEXT:    vpermi2d %ymm9, %ymm11, %ymm12
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm11
@@ -2059,7 +2060,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm15
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm12
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
@@ -2204,15 +2204,16 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %ymm16
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm10
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
+; AVX512DQ-FCP-NEXT:    vmovaps {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %ymm16
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm9, %ymm11, %ymm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm11
@@ -2226,7 +2227,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm15
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
@@ -2418,11 +2418,11 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa (%rcx), %xmm4
 ; SSE-NEXT:    movdqa 16(%rcx), %xmm10
 ; SSE-NEXT:    movdqa (%r8), %xmm8
-; SSE-NEXT:    movdqa (%r9), %xmm11
 ; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    movdqa (%r9), %xmm11
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3]
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[3,3]
@@ -5188,11 +5188,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa (%rcx), %xmm6
 ; SSE-NEXT:    movdqa 16(%rcx), %xmm3
 ; SSE-NEXT:    movdqa (%r8), %xmm9
-; SSE-NEXT:    movdqa (%r9), %xmm8
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm10, %xmm7
+; SSE-NEXT:    movdqa (%r9), %xmm8
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3]
@@ -5269,6 +5269,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm3
 ; SSE-NEXT:    movdqa 32(%rsi), %xmm6
 ; SSE-NEXT:    movdqa %xmm3, %xmm7
@@ -5291,7 +5292,6 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    andps %xmm14, %xmm7
 ; SSE-NEXT:    orps %xmm7, %xmm0
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
@@ -5312,6 +5312,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm3
 ; SSE-NEXT:    movdqa 48(%rsi), %xmm7
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -5332,7 +5333,6 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    andps %xmm14, %xmm8
 ; SSE-NEXT:    orps %xmm8, %xmm0
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
@@ -5353,6 +5353,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE-NEXT:    movdqa 64(%rdi), %xmm3
 ; SSE-NEXT:    movdqa 64(%rsi), %xmm8
 ; SSE-NEXT:    movdqa %xmm3, %xmm11
@@ -5373,7 +5374,6 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    andps %xmm14, %xmm11
 ; SSE-NEXT:    orps %xmm11, %xmm0
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
@@ -5394,6 +5394,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm3
 ; SSE-NEXT:    movdqa 80(%rsi), %xmm11
 ; SSE-NEXT:    movdqa %xmm3, %xmm12
@@ -5414,7 +5415,6 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    andps %xmm14, %xmm12
 ; SSE-NEXT:    orps %xmm12, %xmm0
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
@@ -6619,17 +6619,17 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vmovdqa %xmm2, %xmm14
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm8
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-NEXT:    vmovdqa (%r8), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovdqa 32(%r8), %xmm4
 ; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm8
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
 ; AVX2-NEXT:    vmovdqa (%r9), %xmm0
@@ -9244,23 +9244,23 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm5
 ; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm6
+; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm7
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm8
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm5
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index dc362d729fcd31..1d1c4de793b6d6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -2288,6 +2288,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
+; AVX2-NEXT:    vpermd %ymm7, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa (%rsi), %ymm6
 ; AVX2-NEXT:    vmovdqa (%rdx), %ymm5
 ; AVX2-NEXT:    vmovdqa (%rcx), %ymm13
@@ -2295,8 +2297,6 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vmovdqa (%r9), %ymm2
 ; AVX2-NEXT:    vmovdqa (%rax), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
-; AVX2-NEXT:    vpermd %ymm7, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7]
 ; AVX2-NEXT:    vmovdqa %ymm6, %ymm7
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
@@ -3565,27 +3565,27 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa 48(%rsi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdx), %xmm1
+; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rcx), %xmm5
 ; SSE-NEXT:    movdqa 48(%r8), %xmm9
-; SSE-NEXT:    movdqa 48(%r9), %xmm4
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 48(%rax), %xmm7
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
+; SSE-NEXT:    movaps 48(%rax), %xmm7
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, %xmm10
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm6, %xmm1
-; SSE-NEXT:    pandn %xmm0, %xmm1
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7]
+; SSE-NEXT:    pandn %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm5, %xmm11
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    pand %xmm6, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
-; SSE-NEXT:    movdqa %xmm3, %xmm5
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 48(%r9), %xmm4
+; SSE-NEXT:    movdqa %xmm3, %xmm5
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
 ; SSE-NEXT:    pand %xmm3, %xmm0
@@ -4798,11 +4798,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vmovdqa (%rcx), %ymm14
 ; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm9
-; AVX2-NEXT:    vmovdqa 32(%r8), %ymm6
-; AVX2-NEXT:    vmovdqa 32(%r9), %ymm7
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
+; AVX2-NEXT:    vmovdqa 32(%r9), %ymm7
 ; AVX2-NEXT:    vpermd %ymm8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-NEXT:    vmovdqa 32(%r8), %ymm6
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
@@ -4814,7 +4815,6 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm5, %ymm0
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
@@ -7719,32 +7719,31 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 112(%rsi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 96(%rdx), %xmm5
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 112(%rdx), %xmm1
-; SSE-NEXT:    movdqa 96(%rcx), %xmm12
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 112(%rcx), %xmm6
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
 ; SSE-NEXT:    movdqa 112(%r8), %xmm4
-; SSE-NEXT:    movdqa 112(%r9), %xmm8
-; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 112(%rax), %xmm7
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
-; SSE-NEXT:    movdqa %xmm1, %xmm10
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535]
+; SSE-NEXT:    movdqa %xmm1, %xmm10
 ; SSE-NEXT:    movdqa %xmm13, %xmm1
+; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    pand %xmm13, %xmm0
-; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
-; SSE-NEXT:    movdqa %xmm15, %xmm1
-; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movdqa 96(%rcx), %xmm12
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE-NEXT:    movdqa 112(%r9), %xmm8
+; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE-NEXT:    movdqa 96(%rdx), %xmm5
 ; SSE-NEXT:    pand %xmm3, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm4, %xmm9
@@ -7755,7 +7754,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    psrld $16, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm4
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm15[0,2]
 ; SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
 ; SSE-NEXT:    andps %xmm1, %xmm4
 ; SSE-NEXT:    andnps %xmm7, %xmm1
@@ -8135,15 +8134,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535]
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pandn %xmm3, %xmm1
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2]
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm0, %xmm4
 ; SSE-NEXT:    pandn %xmm3, %xmm4
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    pand %xmm0, %xmm3
-; SSE-NEXT:    por %xmm4, %xmm3
 ; SSE-NEXT:    movdqa %xmm6, %xmm0
+; SSE-NEXT:    por %xmm4, %xmm3
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
 ; SSE-NEXT:    movdqa %xmm0, %xmm4
 ; SSE-NEXT:    movdqa %xmm0, %xmm6
@@ -8847,7 +8846,6 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT:    movdqa %xmm10, %xmm13
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[1,1]
 ; SSE-NEXT:    movaps %xmm5, %xmm2
 ; SSE-NEXT:    andnps %xmm1, %xmm2
@@ -8884,6 +8882,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT:    movaps %xmm10, %xmm13
 ; SSE-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    psrld $16, %xmm1
@@ -9023,7 +9022,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pandn %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm13, %xmm7
+; SSE-NEXT:    movaps %xmm13, %xmm7
 ; SSE-NEXT:    shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
 ; SSE-NEXT:    # xmm7 = xmm7[1],mem[0]
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -9647,13 +9646,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovdqa 48(%r9), %xmm1
 ; AVX-NEXT:    vmovdqa 48(%r8), %xmm2
 ; AVX-NEXT:    vmovdqa 48(%rax), %xmm11
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX-NEXT:    vmovdqa %xmm2, %xmm9
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa %xmm1, %xmm10
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm7, %xmm2
-; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5],xmm1[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[0,2],xmm11[1,3]
@@ -9790,14 +9788,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vorps %ymm0, %ymm7, %ymm7
 ; AVX-NEXT:    vmovdqa 80(%r9), %xmm0
 ; AVX-NEXT:    vmovdqa 80(%r8), %xmm2
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX-NEXT:    vmovdqa %xmm0, %xmm4
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX-NEXT:    vmovdqa %xmm2, %xmm3
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm0, %xmm4
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa 80(%rax), %xmm2
-; AVX-NEXT:    vmovdqa %xmm6, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm15 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4],xmm6[5],xmm15[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm0[0,2],xmm2[1,3]
@@ -15369,8 +15366,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm28, %zmm29
 ; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
 ; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm2, %zmm0
-; AVX512BW-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512BW-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
+; AVX512BW-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512BW-NEXT:    kmovd %ecx, %k3
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm7 {%k3}
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
@@ -15595,8 +15592,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm28, %zmm29
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
 ; AVX512BW-FCP-NEXT:    vpermt2w %zmm20, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
+; AVX512BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512BW-FCP-NEXT:    kmovd %ecx, %k3
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm7 {%k3}
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
@@ -15821,8 +15818,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm28, %zmm29
 ; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
 ; AVX512DQ-BW-NEXT:    vpermt2w %zmm20, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
+; AVX512DQ-BW-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512DQ-BW-NEXT:    kmovd %ecx, %k3
 ; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm0, %zmm7 {%k3}
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
@@ -16047,8 +16044,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm28, %zmm29
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm20, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm7 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
index e333e47219116d..22508e2ccfc79e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
@@ -1321,14 +1321,14 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ;
 ; AVX2-LABEL: store_i32_stride2_vf64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovaps 224(%rdi), %ymm0
+; AVX2-NEXT:    vmovaps (%rdi), %ymm1
+; AVX2-NEXT:    vmovaps 96(%rdi), %ymm9
+; AVX2-NEXT:    vmovaps 128(%rdi), %ymm8
 ; AVX2-NEXT:    vmovaps 192(%rdi), %ymm3
+; AVX2-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-NEXT:    vmovaps 160(%rdi), %ymm6
-; AVX2-NEXT:    vmovaps 128(%rdi), %ymm8
-; AVX2-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX2-NEXT:    vmovaps 32(%rdi), %ymm4
 ; AVX2-NEXT:    vmovaps 64(%rdi), %ymm7
-; AVX2-NEXT:    vmovaps 96(%rdi), %ymm9
 ; AVX2-NEXT:    vmovaps 192(%rsi), %ymm10
 ; AVX2-NEXT:    vmovaps 160(%rsi), %ymm11
 ; AVX2-NEXT:    vmovaps 128(%rsi), %ymm12
@@ -1393,14 +1393,14 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ;
 ; AVX2-FP-LABEL: store_i32_stride2_vf64:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovaps 192(%rsi), %ymm10
 ; AVX2-FP-NEXT:    vmovaps 160(%rsi), %ymm11
 ; AVX2-FP-NEXT:    vmovaps 128(%rsi), %ymm12
@@ -1465,14 +1465,14 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ;
 ; AVX2-FCP-LABEL: store_i32_stride2_vf64:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovaps 192(%rsi), %ymm10
 ; AVX2-FCP-NEXT:    vmovaps 160(%rsi), %ymm11
 ; AVX2-FCP-NEXT:    vmovaps 128(%rsi), %ymm12
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index de2e1df4c5566b..7d636b2d8aa3b7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -714,13 +714,13 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movaps 16(%rsi), %xmm9
 ; SSE-NEXT:    movaps 32(%rsi), %xmm10
 ; SSE-NEXT:    movaps 48(%rsi), %xmm11
-; SSE-NEXT:    movaps 16(%rdx), %xmm0
-; SSE-NEXT:    movaps 32(%rdx), %xmm3
 ; SSE-NEXT:    movaps 48(%rdx), %xmm8
+; SSE-NEXT:    movaps 32(%rdx), %xmm3
 ; SSE-NEXT:    movaps %xmm5, %xmm12
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
 ; SSE-NEXT:    movaps %xmm5, %xmm13
 ; SSE-NEXT:    movaps %xmm5, %xmm6
+; SSE-NEXT:    movaps 16(%rdx), %xmm0
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm11[3,3]
 ; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index 58991d65cf1ee5..ede8586545e496 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -1334,11 +1334,9 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX-LABEL: store_i32_stride5_vf16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %ymm0
+; AVX-NEXT:    vmovaps (%rsi), %xmm6
 ; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX-NEXT:    vmovaps 32(%rsi), %ymm3
-; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
-; AVX-NEXT:    vmovaps (%rsi), %xmm6
 ; AVX-NEXT:    vmovaps 32(%rsi), %xmm8
 ; AVX-NEXT:    vmovaps (%rdi), %xmm10
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm9
@@ -1346,11 +1344,12 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = xmm10[0],xmm6[0],zero,zero
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
 ; AVX-NEXT:    vmovaps (%rdx), %xmm11
-; AVX-NEXT:    vmovaps (%rcx), %xmm12
 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0]
+; AVX-NEXT:    vmovaps (%rcx), %xmm12
 ; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0]
 ; AVX-NEXT:    vbroadcastss 4(%rdx), %xmm13
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm7, %ymm7
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6],ymm7[7]
 ; AVX-NEXT:    vinsertf128 $1, (%r8), %ymm5, %ymm5
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6,7]
@@ -1368,6 +1367,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vmovaps 32(%rcx), %ymm7
 ; AVX-NEXT:    vinsertf128 $1, 32(%r8), %ymm15, %ymm15
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7]
+; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX-NEXT:    vbroadcastss 4(%rcx), %xmm12
@@ -4774,11 +4774,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps (%rcx), %xmm5
 ; SSE-NEXT:    movaps 16(%rcx), %xmm9
-; SSE-NEXT:    movaps 32(%rcx), %xmm12
-; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps (%r8), %xmm4
-; SSE-NEXT:    movaps 16(%r8), %xmm11
 ; SSE-NEXT:    movaps 32(%r8), %xmm14
+; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 16(%r8), %xmm11
 ; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm5, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3]
@@ -4789,8 +4788,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm9, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3]
+; SSE-NEXT:    movaps 32(%rcx), %xmm12
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -5310,11 +5310,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0]
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm2, %xmm1
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE-NEXT:    movaps 240(%rdi), %xmm5
-; SSE-NEXT:    movaps %xmm5, %xmm6
+; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT:    movaps %xmm5, %xmm6
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1]
 ; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
 ; SSE-NEXT:    movaps %xmm5, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index e4f616ed730ebf..31d7791d674a44 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -10090,48 +10090,47 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    subq $1160, %rsp # imm = 0x488
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
 ; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
 ; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm27
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
@@ -10659,48 +10658,47 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    subq $1160, %rsp # imm = 0x488
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
 ; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm27
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512DQ-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512DQ-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
@@ -11228,48 +11226,47 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    subq $1160, %rsp # imm = 0x488
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
 ; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm27
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
@@ -11797,48 +11794,47 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    subq $1160, %rsp # imm = 0x488
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 13930bc2c67407..275f36005f1ee0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -1779,24 +1779,24 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $520, %rsp # imm = 0x208
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT:    movdqa (%rdi), %xmm10
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%rsi), %xmm4
-; SSE-NEXT:    movdqa 16(%rsi), %xmm6
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps (%rdx), %xmm5
+; SSE-NEXT:    movaps (%rcx), %xmm8
+; SSE-NEXT:    movaps 16(%r8), %xmm14
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%rdx), %xmm9
-; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps (%rcx), %xmm8
+; SSE-NEXT:    movaps (%r8), %xmm15
+; SSE-NEXT:    movdqa (%r9), %xmm13
 ; SSE-NEXT:    movaps 16(%rcx), %xmm2
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps (%r8), %xmm15
-; SSE-NEXT:    movaps 16(%r8), %xmm14
+; SSE-NEXT:    movdqa (%rdi), %xmm10
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa (%r9), %xmm13
 ; SSE-NEXT:    movdqa 16(%r9), %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%rax), %xmm11
+; SSE-NEXT:    movdqa 16(%rsi), %xmm6
 ; SSE-NEXT:    movaps %xmm15, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
@@ -1917,11 +1917,11 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0]
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm4, %xmm13
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; SSE-NEXT:    movdqa %xmm4, %xmm13
+; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0]
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3]
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -4015,36 +4015,35 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movdqa (%rdi), %xmm8
 ; SSE-NEXT:    movdqa (%rsi), %xmm10
-; SSE-NEXT:    movdqa 16(%rsi), %xmm4
 ; SSE-NEXT:    movaps (%rdx), %xmm14
-; SSE-NEXT:    movdqa 16(%rdx), %xmm7
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps (%rcx), %xmm13
-; SSE-NEXT:    movaps 16(%rcx), %xmm9
+; SSE-NEXT:    movdqa 16(%rdx), %xmm7
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 16(%rcx), %xmm9
 ; SSE-NEXT:    movaps (%r8), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 16(%r8), %xmm11
-; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%r9), %xmm15
-; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%r9), %xmm12
+; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%rsi), %xmm4
 ; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%rax), %xmm2
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1]
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
 ; SSE-NEXT:    movaps %xmm14, %xmm3
 ; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm8, %xmm0
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 16(%r8), %xmm11
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0]
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4065,9 +4064,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rsi), %xmm1
-; SSE-NEXT:    movaps 32(%rdx), %xmm5
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 32(%rdx), %xmm5
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm5, %xmm1
@@ -4119,9 +4118,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 64(%rsi), %xmm1
-; SSE-NEXT:    movaps 64(%rdx), %xmm3
-; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 64(%rdx), %xmm3
 ; SSE-NEXT:    movdqa %xmm1, %xmm12
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm1
@@ -6798,8 +6797,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512-NEXT:    kmovw %ecx, %k2
 ; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7001,8 +7000,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512-FCP-NEXT:    kmovw %ecx, %k2
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7204,8 +7203,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512DQ-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-NEXT:    kmovw %ecx, %k2
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7407,8 +7406,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512DQ-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-FCP-NEXT:    kmovw %ecx, %k2
 ; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7610,8 +7609,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512BW-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512BW-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512BW-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512BW-NEXT:    kmovd %ecx, %k2
 ; AVX512BW-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7813,8 +7812,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512BW-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512BW-FCP-NEXT:    kmovd %ecx, %k2
 ; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -8016,8 +8015,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-BW-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512DQ-BW-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-BW-NEXT:    kmovd %ecx, %k2
 ; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -8219,8 +8218,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -8381,23 +8380,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $2760, %rsp # imm = 0xAC8
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT:    movdqa (%rdi), %xmm6
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%rsi), %xmm4
-; SSE-NEXT:    movdqa 16(%rsi), %xmm3
-; SSE-NEXT:    movaps (%rdx), %xmm2
+; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps (%rdx), %xmm2
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%rdx), %xmm7
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%rsi), %xmm3
 ; SSE-NEXT:    movaps (%rcx), %xmm13
 ; SSE-NEXT:    movaps 16(%rcx), %xmm9
+; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa (%rdi), %xmm6
 ; SSE-NEXT:    movaps (%r8), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 16(%r8), %xmm10
 ; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%r9), %xmm12
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%r9), %xmm8
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%rax), %xmm15
@@ -8434,9 +8433,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rsi), %xmm1
-; SSE-NEXT:    movaps 32(%rdx), %xmm3
-; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 32(%rdx), %xmm3
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm1
@@ -8487,9 +8486,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 64(%rsi), %xmm1
-; SSE-NEXT:    movaps 64(%rdx), %xmm4
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 64(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
@@ -8540,9 +8539,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 96(%rsi), %xmm1
-; SSE-NEXT:    movaps 96(%rdx), %xmm4
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 96(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
@@ -8592,9 +8591,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rsi), %xmm1
-; SSE-NEXT:    movaps 128(%rdx), %xmm4
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 128(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
@@ -8645,9 +8644,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 160(%rsi), %xmm1
-; SSE-NEXT:    movaps 160(%rdx), %xmm4
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 160(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
@@ -8698,9 +8697,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 192(%rsi), %xmm1
-; SSE-NEXT:    movaps 192(%rdx), %xmm4
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 192(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
@@ -9646,14 +9645,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    subq $3432, %rsp # imm = 0xD68
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 224(%rdx), %ymm0
 ; AVX-NEXT:    vmovaps 224(%rsi), %ymm2
+; AVX-NEXT:    vmovaps 224(%r8), %ymm4
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 224(%rdx), %ymm0
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 224(%rcx), %ymm5
 ; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 224(%r8), %ymm4
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 224(%rax), %ymm3
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
index 265f6daeb20038..bac4ff8ce434d1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
@@ -864,7 +864,6 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movaps (%r10), %xmm14
 ; SSE-NEXT:    movaps 16(%r10), %xmm12
 ; SSE-NEXT:    movaps (%rax), %xmm4
-; SSE-NEXT:    movaps 16(%rax), %xmm7
 ; SSE-NEXT:    movaps %xmm4, %xmm2
 ; SSE-NEXT:    movaps %xmm4, %xmm11
 ; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0]
@@ -872,10 +871,11 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm2
 ; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm8, %xmm4
+; SSE-NEXT:    movaps 16(%rax), %xmm7
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
 ; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
index a01d4de0027f40..75b76f891d46ca 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
@@ -843,12 +843,12 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    subq $24, %rsp
 ; SSE-NEXT:    movapd 64(%rdi), %xmm5
 ; SSE-NEXT:    movapd (%rdi), %xmm1
-; SSE-NEXT:    movapd 16(%rdi), %xmm2
-; SSE-NEXT:    movapd 32(%rdi), %xmm3
-; SSE-NEXT:    movapd 48(%rdi), %xmm6
-; SSE-NEXT:    movapd 64(%rsi), %xmm9
 ; SSE-NEXT:    movapd (%rsi), %xmm4
 ; SSE-NEXT:    movapd 16(%rsi), %xmm7
+; SSE-NEXT:    movapd 64(%rsi), %xmm9
+; SSE-NEXT:    movapd 48(%rdi), %xmm6
+; SSE-NEXT:    movapd 32(%rdi), %xmm3
+; SSE-NEXT:    movapd 16(%rdi), %xmm2
 ; SSE-NEXT:    movapd 32(%rsi), %xmm11
 ; SSE-NEXT:    movapd 48(%rsi), %xmm10
 ; SSE-NEXT:    movapd 64(%rdx), %xmm15
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index ded7c002c8735b..d610029880f81a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -6745,15 +6745,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 320(%rdx), %zmm7
 ; AVX512-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm14
-; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512-NEXT:    vmovdqa64 128(%rdx), %zmm11
 ; AVX512-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512-NEXT:    vmovdqa64 320(%rcx), %zmm3
 ; AVX512-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm12
 ; AVX512-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm13
@@ -7040,15 +7040,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 320(%rdx), %zmm7
 ; AVX512-FCP-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
@@ -7335,15 +7335,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 320(%rdx), %zmm7
 ; AVX512DQ-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm14
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-NEXT:    vmovdqa64 128(%rdx), %zmm11
 ; AVX512DQ-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-NEXT:    vmovdqa64 320(%rcx), %zmm3
 ; AVX512DQ-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512DQ-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm12
 ; AVX512DQ-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm13
@@ -7630,15 +7630,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdx), %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
@@ -7925,15 +7925,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 320(%rdx), %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm12
 ; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm13
@@ -8220,15 +8220,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdx), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
 ; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
 ; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
@@ -8515,15 +8515,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdx), %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm14
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdx), %zmm11
 ; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rcx), %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm12
 ; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm13
@@ -8810,15 +8810,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdx), %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index ffdbdea024ea0f..78a8042b3535ef 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -7394,20 +7394,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-LABEL: store_i64_stride5_vf64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    subq $2264, %rsp # imm = 0x8D8
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm14
-; AVX-NEXT:    vmovaps 160(%rdi), %ymm4
 ; AVX-NEXT:    vmovaps 96(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 160(%rdi), %ymm4
 ; AVX-NEXT:    vmovaps 64(%rcx), %ymm1
-; AVX-NEXT:    vmovaps 128(%rcx), %ymm0
 ; AVX-NEXT:    vmovaps (%rcx), %ymm2
+; AVX-NEXT:    vmovaps 128(%rcx), %ymm0
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = mem[2,3,2,3]
 ; AVX-NEXT:    vmovaps 16(%rdx), %xmm6
 ; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
+; AVX-NEXT:    vmovaps 80(%rdx), %xmm3
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm14
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = mem[2,3,2,3]
-; AVX-NEXT:    vmovaps 80(%rdx), %xmm3
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index 89642492f83a85..651f851f9f6f96 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -4443,7 +4443,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm28
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm26, %zmm28
-; AVX512DQ-FCP-NEXT:    movb $112, %sil
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
 ; AVX512DQ-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
@@ -4458,6 +4457,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm19
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm19
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT:    movb $112, %sil
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
@@ -4468,10 +4468,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm29
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm28, %zmm29
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm28, %zmm24
-; AVX512DQ-FCP-NEXT:    movb $96, %sil
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm27
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm21
+; AVX512DQ-FCP-NEXT:    movb $96, %sil
 ; AVX512DQ-FCP-NEXT:    vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
 ; AVX512DQ-FCP-NEXT:    kmovw %esi, %k3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm24 {%k3}
@@ -4481,7 +4481,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm29, %zmm22
-; AVX512DQ-FCP-NEXT:    movb $120, %sil
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm30
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm20, %zmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm26
@@ -4501,6 +4500,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm7, %zmm12, %zmm20
+; AVX512DQ-FCP-NEXT:    movb $120, %sil
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm9, %zmm20, %zmm30
@@ -5267,7 +5267,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm26, %zmm28
-; AVX512DQ-BW-FCP-NEXT:    movb $112, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
@@ -5282,6 +5281,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm22, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    movb $112, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
@@ -5292,10 +5292,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm29
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm28, %zmm29
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm28, %zmm24
-; AVX512DQ-BW-FCP-NEXT:    movb $96, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    movb $96, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %esi, %k3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm24 {%k3}
@@ -5305,7 +5305,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm29, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    movb $120, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm20, %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm26
@@ -5325,6 +5324,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm7, %zmm12, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    movb $120, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm9, %zmm20, %zmm30
@@ -16932,58 +16932,58 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm19
-; AVX512-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm17
 ; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm17
 ; AVX512-NEXT:    vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 (%r8), %zmm1
+; AVX512-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 64(%rcx), %zmm18
 ; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 64(%rcx), %zmm18
 ; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 (%r9), %zmm15
+; AVX512-NEXT:    kmovw %r10d, %k1
+; AVX512-NEXT:    movb $96, %r10b
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
 ; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vmovdqa64 (%rax), %zmm5
+; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm3
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
 ; AVX512-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    movb $96, %r10b
-; AVX512-NEXT:    kmovw %r10d, %k1
-; AVX512-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512-NEXT:    vmovdqa64 (%r9), %zmm15
-; AVX512-NEXT:    vmovdqa64 (%rax), %zmm5
 ; AVX512-NEXT:    vmovdqa64 64(%rax), %zmm6
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
 ; AVX512-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
 ; AVX512-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512-NEXT:    vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm0
 ; AVX512-NEXT:    vpermt2q %zmm9, %zmm11, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512-NEXT:    vpermt2q %zmm4, %zmm31, %zmm2
 ; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermt2q %zmm4, %zmm31, %zmm2
+; AVX512-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512-NEXT:    vmovdqa 64(%r9), %ymm3
 ; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 64(%r8), %ymm4
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512-NEXT:    movb $28, %r10b
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    kmovw %r10d, %k2
+; AVX512-NEXT:    movb $28, %r10b
+; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm19
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
@@ -17894,48 +17894,44 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm20
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
+; AVX512-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm19
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
 ; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 (%r9), %zmm18
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
 ; AVX512-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    movb $96, %r10b
-; AVX512-FCP-NEXT:    kmovw %r10d, %k1
-; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 (%r9), %zmm18
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm10
+; AVX512-FCP-NEXT:    kmovw %r10d, %k1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm12
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm9, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
 ; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm10
@@ -17947,7 +17943,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
 ; AVX512-FCP-NEXT:    movb $28, %r10b
 ; AVX512-FCP-NEXT:    kmovw %r10d, %k2
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm14
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
@@ -18854,36 +18853,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm16
-; AVX512DQ-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 64(%rsi), %zmm13
-; AVX512DQ-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm5
-; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 (%r8), %zmm1
+; AVX512DQ-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 64(%rcx), %zmm15
 ; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 64(%rcx), %zmm15
-; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 (%r9), %zmm14
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
 ; AVX512DQ-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    movb $96, %r10b
+; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    kmovw %r10d, %k1
-; AVX512DQ-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 (%r9), %zmm14
-; AVX512DQ-NEXT:    vmovdqa64 (%rax), %zmm3
 ; AVX512DQ-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
@@ -18892,18 +18890,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
 ; AVX512DQ-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512DQ-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512DQ-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm6
 ; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm16
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQ-NEXT:    movb $28, %r10b
 ; AVX512DQ-NEXT:    kmovw %r10d, %k2
@@ -19808,37 +19807,36 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm21
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    movb $96, %r10b
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
 ; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    movb $96, %r10b
 ; AVX512DQ-FCP-NEXT:    kmovw %r10d, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r9), %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovaps (%rdx), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r9), %zmm22
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
@@ -19847,18 +19845,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm13
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm13
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm12
 ; AVX512DQ-FCP-NEXT:    movb $28, %r10b
 ; AVX512DQ-FCP-NEXT:    kmovw %r10d, %k2
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
@@ -20761,58 +20760,58 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm17
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512BW-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm17
 ; AVX512BW-NEXT:    vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm1
+; AVX512BW-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm18
 ; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm18
 ; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm15
+; AVX512BW-NEXT:    kmovd %r10d, %k1
+; AVX512BW-NEXT:    movb $96, %r10b
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
 ; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm3
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
 ; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    movb $96, %r10b
-; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm5
 ; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm6
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
 ; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
 ; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm11, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm31, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512BW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm31, %zmm2
+; AVX512BW-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512BW-NEXT:    vmovdqa 64(%r9), %ymm3
 ; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    vmovdqa 64(%r8), %ymm4
-; AVX512BW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512BW-NEXT:    movb $28, %r10b
+; AVX512BW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    kmovd %r10d, %k2
+; AVX512BW-NEXT:    movb $28, %r10b
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm19
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
@@ -21723,48 +21722,44 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
+; AVX512BW-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
 ; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm18
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
 ; AVX512BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    movb $96, %r10b
-; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm18
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm10
+; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm12
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm9, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm3, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa 64(%r9), %ymm10
@@ -21776,7 +21771,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
 ; AVX512BW-FCP-NEXT:    movb $28, %r10b
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
@@ -22683,36 +22681,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm16
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rsi), %zmm13
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm1
+; AVX512DQ-BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm15
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm15
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm14
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
 ; AVX512DQ-BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    movb $96, %r10b
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm14
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rax), %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
@@ -22721,18 +22718,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512DQ-BW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-BW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa 64(%r8), %ymm6
 ; AVX512DQ-BW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm16
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQ-BW-NEXT:    movb $28, %r10b
 ; AVX512DQ-BW-NEXT:    kmovd %r10d, %k2
@@ -23637,37 +23635,36 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm21
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    movb $96, %r10b
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    movb $96, %r10b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdx), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
@@ -23676,18 +23673,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm8, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm4, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%r9), %ymm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%r9), %ymm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    movb $28, %r10b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
index e837f14d367b25..5c005567db232c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
@@ -18632,16 +18632,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm15
 ; AVX512-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512-NEXT:    vmovdqa64 64(%r8), %zmm18
-; AVX512-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512-NEXT:    vmovdqa64 64(%r10), %zmm31
-; AVX512-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512-NEXT:    vmovdqa64 128(%rax), %zmm13
+; AVX512-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512-NEXT:    movb $-64, %r11b
 ; AVX512-NEXT:    kmovw %r11d, %k1
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
@@ -18676,18 +18674,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -19579,16 +19579,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm15
 ; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%r8), %zmm18
-; AVX512-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512-FCP-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512-FCP-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%r10), %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512-FCP-NEXT:    movb $-64, %r11b
 ; AVX512-FCP-NEXT:    kmovw %r11d, %k1
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
@@ -19623,18 +19621,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -20526,16 +20526,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm15
 ; AVX512DQ-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512DQ-NEXT:    vmovdqa64 64(%r8), %zmm18
-; AVX512DQ-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512DQ-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512DQ-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512DQ-NEXT:    vmovdqa64 64(%r10), %zmm31
-; AVX512DQ-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512DQ-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-NEXT:    vmovdqa64 128(%rax), %zmm13
+; AVX512DQ-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-NEXT:    movb $-64, %r11b
 ; AVX512DQ-NEXT:    kmovw %r11d, %k1
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
@@ -20570,18 +20568,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512DQ-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512DQ-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -21473,16 +21473,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm15
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r8), %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r10), %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-FCP-NEXT:    movb $-64, %r11b
 ; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k1
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
@@ -21517,18 +21515,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -22420,16 +22420,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm15
 ; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512BW-NEXT:    vmovdqa64 128(%rax), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512BW-NEXT:    movb $-64, %r11b
 ; AVX512BW-NEXT:    kmovd %r11d, %k1
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
@@ -22464,18 +22462,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -23367,16 +23367,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm15
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm18
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r10), %zmm31
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512BW-FCP-NEXT:    movb $-64, %r11b
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
@@ -23411,18 +23409,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -24314,16 +24314,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm15
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r8), %zmm18
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r10), %zmm31
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rax), %zmm13
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-BW-NEXT:    movb $-64, %r11b
 ; AVX512DQ-BW-NEXT:    kmovd %r11d, %k1
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
@@ -24358,18 +24356,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -25261,16 +25261,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r10), %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    movb $-64, %r11b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
@@ -25305,18 +25303,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 3d26171054f2e4..c1e99368e9201a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -1645,7 +1645,6 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshufb %xmm8, %xmm7, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX-NEXT:    vmovdqa 16(%rdx), %xmm1
 ; AVX-NEXT:    vpshufb %xmm8, %xmm9, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb %xmm8, %xmm6, %xmm0
@@ -1660,6 +1659,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshufb %xmm11, %xmm14, %xmm15
 ; AVX-NEXT:    vpor %xmm2, %xmm15, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 16(%rdx), %xmm1
 ; AVX-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
 ; AVX-NEXT:    vpshufb %xmm11, %xmm13, %xmm15
 ; AVX-NEXT:    vpor %xmm6, %xmm15, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 06d390f053c7ee..5e87572af5dc14 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -981,7 +981,6 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa (%rsi), %xmm8
 ; SSE-NEXT:    movdqa (%rdx), %xmm2
 ; SSE-NEXT:    movdqa (%rcx), %xmm4
-; SSE-NEXT:    movdqa (%r8), %xmm0
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    movdqa %xmm2, %xmm9
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
@@ -990,13 +989,14 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3]
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3]
-; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    movdqa %xmm6, %xmm5
+; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    pandn %xmm3, %xmm5
 ; SSE-NEXT:    por %xmm1, %xmm5
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
 ; SSE-NEXT:    pand %xmm2, %xmm5
 ; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2]
+; SSE-NEXT:    movdqa (%r8), %xmm0
 ; SSE-NEXT:    movdqa %xmm10, %xmm12
 ; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
@@ -1661,7 +1661,6 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%rdx), %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa (%rcx), %xmm11
 ; SSE-NEXT:    movdqa 16(%rcx), %xmm12
 ; SSE-NEXT:    movdqa 16(%r8), %xmm14
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
@@ -1680,8 +1679,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2]
 ; SSE-NEXT:    movdqa %xmm3, %xmm15
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; SSE-NEXT:    pand %xmm3, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
+; SSE-NEXT:    pand %xmm3, %xmm0
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7]
@@ -1691,6 +1690,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    por %xmm0, %xmm5
 ; SSE-NEXT:    movdqa %xmm8, %xmm0
 ; SSE-NEXT:    pandn %xmm5, %xmm0
+; SSE-NEXT:    movdqa (%rcx), %xmm11
 ; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
 ; SSE-NEXT:    pand %xmm10, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 60af864597f4f7..fbeecbc0a4ab29 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -1767,10 +1767,10 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa %xmm3, %xmm7
 ; SSE-NEXT:    pandn %xmm2, %xmm7
 ; SSE-NEXT:    por %xmm6, %xmm7
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm7
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE-NEXT:    pand %xmm2, %xmm7
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pandn %xmm6, %xmm1
@@ -2231,11 +2231,10 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-LABEL: store_i8_stride6_vf32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX2-NEXT:    vmovdqa (%rcx), %ymm2
 ; AVX2-NEXT:    vmovdqa (%rdx), %ymm0
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%rcx), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa (%r8), %ymm4
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
@@ -2244,6 +2243,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovdqa (%rdx), %xmm8
 ; AVX2-NEXT:    vpshufb %xmm7, %xmm8, %xmm9
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
 ; AVX2-NEXT:    vmovdqa (%rsi), %xmm11
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
@@ -2365,15 +2365,15 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-LABEL: store_i8_stride6_vf32:
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $40, %rsp
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm3
 ; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm4
-; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm0
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm1, %xmm7
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm1
 ; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2501,15 +2501,15 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-LABEL: store_i8_stride6_vf32:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $40, %rsp
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm4
-; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm7
 ; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -5451,10 +5451,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm4
 ; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm22
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -5836,9 +5836,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm7
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
 ; AVX512-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
 ; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
@@ -5980,10 +5980,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm4
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm22
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -6365,9 +6365,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 416fbe9aa340ca..14e5f65407942e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -2683,16 +2683,16 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm1
 ; SSE-NEXT:    movdqa 16(%rsi), %xmm4
 ; SSE-NEXT:    movdqa 16(%rdx), %xmm3
-; SSE-NEXT:    movdqa 16(%rcx), %xmm7
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%r8), %xmm6
-; SSE-NEXT:    movdqa 16(%r9), %xmm5
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6]
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%r8), %xmm6
 ; SSE-NEXT:    movdqa %xmm1, %xmm15
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%r9), %xmm5
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
-; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
 ; SSE-NEXT:    pand %xmm10, %xmm0
+; SSE-NEXT:    movdqa 16(%rcx), %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
 ; SSE-NEXT:    movdqa %xmm4, %xmm8
 ; SSE-NEXT:    movdqa %xmm4, %xmm13
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3129,10 +3129,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7]
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
+; SSE-NEXT:    movdqa %xmm13, %xmm5
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
 ; SSE-NEXT:    movdqa %xmm2, %xmm4
 ; SSE-NEXT:    pandn %xmm3, %xmm4
-; SSE-NEXT:    movdqa %xmm13, %xmm5
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-NEXT:    pand %xmm2, %xmm3
@@ -4304,7 +4304,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm6
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 (%r10), %ymm17
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
@@ -4341,6 +4340,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm7, %zmm0
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5]
 ; AVX512-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 (%r10), %ymm17
 ; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero
@@ -4583,7 +4583,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r10), %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
@@ -4620,6 +4619,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm7, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5]
 ; AVX512DQ-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r10), %ymm17
 ; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero
@@ -5328,9 +5328,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa 48(%rax), %xmm13
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm6, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3]
+; SSE-NEXT:    pand %xmm6, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
 ; SSE-NEXT:    movdqa %xmm2, %xmm11
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -5928,12 +5928,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pandn %xmm10, %xmm15
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[1,1,2,1]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,7,5,6,4]
-; SSE-NEXT:    movdqa %xmm6, %xmm0
 ; SSE-NEXT:    movdqa %xmm6, %xmm2
 ; SSE-NEXT:    pandn %xmm10, %xmm2
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm8[1,1,2,2,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
-; SSE-NEXT:    pand %xmm0, %xmm10
+; SSE-NEXT:    pand %xmm6, %xmm10
 ; SSE-NEXT:    por %xmm10, %xmm2
 ; SSE-NEXT:    pand %xmm12, %xmm2
 ; SSE-NEXT:    por %xmm15, %xmm2
@@ -5942,9 +5941,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
 ; SSE-NEXT:    movdqa %xmm0, %xmm15
 ; SSE-NEXT:    pandn %xmm10, %xmm15
-; SSE-NEXT:    movdqa %xmm13, %xmm3
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0]
+; SSE-NEXT:    movdqa %xmm13, %xmm3
 ; SSE-NEXT:    pand %xmm0, %xmm10
 ; SSE-NEXT:    por %xmm10, %xmm15
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7]
@@ -6311,7 +6310,6 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa %xmm12, %xmm10
 ; SSE-NEXT:    pandn %xmm8, %xmm10
 ; SSE-NEXT:    pand %xmm12, %xmm6
-; SSE-NEXT:    movdqa %xmm12, %xmm4
 ; SSE-NEXT:    por %xmm6, %xmm10
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
@@ -6323,13 +6321,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    movdqa %xmm11, %xmm1
 ; SSE-NEXT:    movdqa %xmm11, %xmm8
 ; SSE-NEXT:    pandn %xmm0, %xmm8
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    por %xmm0, %xmm8
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
 ; SSE-NEXT:    movdqa %xmm3, %xmm10
@@ -6378,17 +6375,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
 ; SSE-NEXT:    pand %xmm7, %xmm0
 ; SSE-NEXT:    por %xmm8, %xmm0
+; SSE-NEXT:    movdqa %xmm12, %xmm14
 ; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
 ; SSE-NEXT:    movdqa %xmm12, %xmm8
 ; SSE-NEXT:    pandn %xmm0, %xmm8
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,4]
-; SSE-NEXT:    movdqa %xmm4, %xmm14
-; SSE-NEXT:    movdqa %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm14, %xmm0
 ; SSE-NEXT:    pandn %xmm10, %xmm0
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm11[1,1,2,2,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
-; SSE-NEXT:    pand %xmm4, %xmm10
+; SSE-NEXT:    pand %xmm14, %xmm10
 ; SSE-NEXT:    por %xmm10, %xmm0
 ; SSE-NEXT:    pand %xmm12, %xmm0
 ; SSE-NEXT:    movdqa %xmm12, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 311166ef60dda0..39b012bcf8d4e4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -3024,7 +3024,6 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovaps 16(%rsi), %xmm8
 ; AVX2-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5,6,7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13,14,15]
-; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm9
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4],ymm15[5],ymm0[6],ymm15[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3036,13 +3035,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm8
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm1, %ymm1
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm9
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm13, %ymm13
 ; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm15
@@ -3052,12 +3051,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm8
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,6,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-intrinsics.ll b/llvm/test/CodeGen/X86/vector-intrinsics.ll
index ea4a339d9d5e86..6441a83a4e326b 100644
--- a/llvm/test/CodeGen/X86/vector-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-intrinsics.ll
@@ -238,14 +238,14 @@ define void @b(ptr %p, ptr %q) nounwind {
 ; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT:    movaps 32(%rdi), %xmm0
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps 48(%rdi), %xmm2
-; CHECK-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    movaps (%rsi), %xmm0
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    movaps 16(%rsi), %xmm0
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movaps 48(%rdi), %xmm2
 ; CHECK-NEXT:    movaps 32(%rsi), %xmm0
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    movaps 48(%rsi), %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index dc9e69137a8a7e..c58c4614022c63 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -3320,27 +3320,24 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pslld $31, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_4i1_to_4i64:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    pslld $31, %xmm0
 ; SSSE3-NEXT:    psrad $31, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_4i1_to_4i64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pslld $31, %xmm0
 ; SSE41-NEXT:    psrad $31, %xmm0
-; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: sext_4i1_to_4i64:
@@ -3370,18 +3367,16 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pslld $31, %xmm0
 ; X86-SSE2-NEXT:    psrad $31, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE41-LABEL: sext_4i1_to_4i64:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    pslld $31, %xmm0
 ; X86-SSE41-NEXT:    psrad $31, %xmm0
-; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
 ; X86-SSE41-NEXT:    retl
   %extmask = sext <4 x i1> %mask to <4 x i64>
   ret <4 x i64> %extmask
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 81ce14132c8799..d6171235aa2c46 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -685,11 +685,11 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind {
 ; X86-AVX1-NEXT:    vmovaps 8(%ebp), %ymm5
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm5[2,3]
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
-; X86-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm3[0],ymm6[2],ymm3[2]
-; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3]
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3]
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; X86-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; X86-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm3[0],ymm6[2],ymm3[2]
 ; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
 ; X86-AVX1-NEXT:    vmovaps %ymm4, %ymm1
 ; X86-AVX1-NEXT:    movl %ebp, %esp
@@ -714,10 +714,10 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind {
 ; X86-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm5
 ; X86-AVX2-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm6[0]
 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7]
 ; X86-AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7]
 ; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
 ; X86-AVX2-NEXT:    vmovaps %ymm5, %ymm0
 ; X86-AVX2-NEXT:    vmovaps %ymm4, %ymm1
@@ -739,11 +739,11 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind {
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3]
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
-; X64-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
-; X64-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; X64-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; X64-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
 ; X64-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
 ; X64-AVX1-NEXT:    vmovaps %ymm4, %ymm1
 ; X64-AVX1-NEXT:    vmovaps %ymm5, %ymm3
@@ -754,22 +754,21 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind {
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
 ; X64-AVX2-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
-; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3]
 ; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
 ; X64-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
+; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3]
 ; X64-AVX2-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm3[0]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
 ; X64-AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
+; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
 ; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
 ; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
 ; X64-AVX2-NEXT:    vmovaps %ymm6, %ymm0
 ; X64-AVX2-NEXT:    vmovaps %ymm4, %ymm1
-; X64-AVX2-NEXT:    vmovaps %ymm5, %ymm3
 ; X64-AVX2-NEXT:    retq
   %v0 = shufflevector <16 x i64> %a0, <16 x i64> undef, <16 x i32> <i32 0, i32 1, i32 4, i32 5, i32 2, i32 3, i32 6, i32 7, i32 8, i32 9, i32 12, i32 13, i32 10, i32 11, i32 14, i32 15>
   %v1 = shufflevector <16 x i64> %v0, <16 x i64> undef, <16 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 74926f46ffa437..90b6beeae516de 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -127,10 +127,10 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
@@ -584,10 +584,10 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
@@ -881,10 +881,10 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index f84131dfc87970..35b90a4b2205f2 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -3220,8 +3220,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 49947eddc61b9d..bf330de825966a 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1275,17 +1275,17 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovdqu (%rdi), %xmm11
-; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
-; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm13
 ; AVX1-NEXT:    vmovups 64(%rdi), %xmm0
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovdqu 80(%rdi), %xmm4
+; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm13
 ; AVX1-NEXT:    vmovdqu 96(%rdi), %xmm5
-; AVX1-NEXT:    vmovdqu 112(%rdi), %xmm2
+; AVX1-NEXT:    vmovdqu 80(%rdi), %xmm4
 ; AVX1-NEXT:    vmovdqu 144(%rdi), %xmm10
+; AVX1-NEXT:    vmovdqu 112(%rdi), %xmm2
+; AVX1-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovdqu 160(%rdi), %xmm3
-; AVX1-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
 ; AVX1-NEXT:    vpshufb %xmm9, %xmm5, %xmm6
 ; AVX1-NEXT:    vpshufb %xmm9, %xmm10, %xmm7
 ; AVX1-NEXT:    vpshufb %xmm9, %xmm11, %xmm8
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 2169b39b9dfa05..150385ffd8aa8c 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -63,9 +63,8 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, ptr %res) {
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    imulb %sil
-; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
-; SDAG-NEXT:    movl %ecx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi8:
@@ -83,9 +82,8 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %ecx, %eax
 ; WIN64-NEXT:    imulb %dl
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movb %al, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: smuloi8:
@@ -93,9 +91,8 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    imulb {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
@@ -211,14 +208,14 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl %edx, %ecx
 ; WIN32-NEXT:    movl %eax, %ebp
 ; WIN32-NEXT:    addl %eax, %ecx
@@ -290,9 +287,8 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, ptr %res) {
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    mulb %sil
-; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
-; SDAG-NEXT:    movl %ecx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi8:
@@ -310,9 +306,8 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %ecx, %eax
 ; WIN64-NEXT:    mulb %dl
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movb %al, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi8:
@@ -320,9 +315,8 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mulb {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
@@ -338,9 +332,8 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, ptr %res) {
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SDAG-NEXT:    mulw %si
-; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movw %ax, (%rcx)
-; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi16:
@@ -359,9 +352,8 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %ecx, %eax
 ; WIN64-NEXT:    mulw %dx
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movw %ax, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi16:
@@ -370,9 +362,8 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mulw {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movw %ax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    retl
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
@@ -388,9 +379,8 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, ptr %res) {
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    mull %esi
-; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movl %eax, (%rcx)
-; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi32:
@@ -408,9 +398,8 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %ecx, %eax
 ; WIN64-NEXT:    mull %edx
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movl %eax, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi32:
@@ -419,9 +408,8 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    retl
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -437,9 +425,8 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movq %rdi, %rax
 ; SDAG-NEXT:    mulq %rsi
-; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movq %rax, (%rcx)
-; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi64:
@@ -457,9 +444,8 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %rdx
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movq %rax, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi64:
@@ -985,14 +971,14 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl %edx, %ecx
 ; WIN32-NEXT:    movl %eax, %ebx
 ; WIN32-NEXT:    addl %eax, %ecx
@@ -1399,9 +1385,8 @@ define zeroext i1 @smuloi8_load(ptr %ptr1, i8 %v2, ptr %res) {
 ; SDAG-NEXT:    movl %esi, %eax
 ; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    imulb (%rdi)
-; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
-; SDAG-NEXT:    movl %ecx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi8_load:
@@ -1418,9 +1403,8 @@ define zeroext i1 @smuloi8_load(ptr %ptr1, i8 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %edx, %eax
 ; WIN64-NEXT:    imulb (%rcx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movb %al, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: smuloi8_load:
@@ -1429,9 +1413,8 @@ define zeroext i1 @smuloi8_load(ptr %ptr1, i8 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movzbl (%eax), %eax
 ; WIN32-NEXT:    imulb {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v1 = load i8, ptr %ptr1
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -1447,9 +1430,8 @@ define zeroext i1 @smuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) {
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    imulb (%rsi)
-; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
-; SDAG-NEXT:    movl %ecx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi8_load2:
@@ -1467,9 +1449,8 @@ define zeroext i1 @smuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %ecx, %eax
 ; WIN64-NEXT:    imulb (%rdx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movb %al, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: smuloi8_load2:
@@ -1478,9 +1459,8 @@ define zeroext i1 @smuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    imulb (%ecx)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v2 = load i8, ptr %ptr2
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -1869,9 +1849,8 @@ define zeroext i1 @umuloi8_load(ptr %ptr1, i8 %v2, ptr %res) {
 ; SDAG-NEXT:    movl %esi, %eax
 ; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    mulb (%rdi)
-; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
-; SDAG-NEXT:    movl %ecx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi8_load:
@@ -1888,9 +1867,8 @@ define zeroext i1 @umuloi8_load(ptr %ptr1, i8 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %edx, %eax
 ; WIN64-NEXT:    mulb (%rcx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movb %al, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi8_load:
@@ -1899,9 +1877,8 @@ define zeroext i1 @umuloi8_load(ptr %ptr1, i8 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movzbl (%eax), %eax
 ; WIN32-NEXT:    mulb {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v1 = load i8, ptr %ptr1
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -1917,9 +1894,8 @@ define zeroext i1 @umuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) {
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    mulb (%rsi)
-; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
-; SDAG-NEXT:    movl %ecx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi8_load2:
@@ -1937,9 +1913,8 @@ define zeroext i1 @umuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %ecx, %eax
 ; WIN64-NEXT:    mulb (%rdx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movb %al, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi8_load2:
@@ -1948,9 +1923,8 @@ define zeroext i1 @umuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mulb (%ecx)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v2 = load i8, ptr %ptr2
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -1967,9 +1941,8 @@ define zeroext i1 @umuloi16_load(ptr %ptr1, i16 %v2, ptr %res) {
 ; SDAG-NEXT:    movl %esi, %eax
 ; SDAG-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SDAG-NEXT:    mulw (%rdi)
-; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movw %ax, (%rcx)
-; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi16_load:
@@ -1987,9 +1960,8 @@ define zeroext i1 @umuloi16_load(ptr %ptr1, i16 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %edx, %eax
 ; WIN64-NEXT:    mulw (%rcx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movw %ax, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi16_load:
@@ -1999,9 +1971,8 @@ define zeroext i1 @umuloi16_load(ptr %ptr1, i16 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movzwl (%eax), %eax
 ; WIN32-NEXT:    mulw {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movw %ax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    retl
   %v1 = load i16, ptr %ptr1
@@ -2019,9 +1990,8 @@ define zeroext i1 @umuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) {
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SDAG-NEXT:    mulw (%rsi)
-; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movw %ax, (%rcx)
-; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi16_load2:
@@ -2040,9 +2010,8 @@ define zeroext i1 @umuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %ecx, %eax
 ; WIN64-NEXT:    mulw (%rdx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movw %ax, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi16_load2:
@@ -2052,9 +2021,8 @@ define zeroext i1 @umuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mulw (%ecx)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movw %ax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    retl
   %v2 = load i16, ptr %ptr2
@@ -2071,9 +2039,8 @@ define zeroext i1 @umuloi32_load(ptr %ptr1, i32 %v2, ptr %res) {
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %esi, %eax
 ; SDAG-NEXT:    mull (%rdi)
-; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movl %eax, (%rcx)
-; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi32_load:
@@ -2091,9 +2058,8 @@ define zeroext i1 @umuloi32_load(ptr %ptr1, i32 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %edx, %eax
 ; WIN64-NEXT:    mull (%rcx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movl %eax, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi32_load:
@@ -2103,9 +2069,8 @@ define zeroext i1 @umuloi32_load(ptr %ptr1, i32 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl (%eax), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    retl
   %v1 = load i32, ptr %ptr1
@@ -2122,9 +2087,8 @@ define zeroext i1 @umuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) {
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    mull (%rsi)
-; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movl %eax, (%rcx)
-; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi32_load2:
@@ -2142,9 +2106,8 @@ define zeroext i1 @umuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movl %ecx, %eax
 ; WIN64-NEXT:    mull (%rdx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movl %eax, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi32_load2:
@@ -2154,9 +2117,8 @@ define zeroext i1 @umuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mull (%ecx)
-; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    retl
   %v2 = load i32, ptr %ptr2
@@ -2173,9 +2135,8 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movq %rsi, %rax
 ; SDAG-NEXT:    mulq (%rdi)
-; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movq %rax, (%rcx)
-; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi64_load:
@@ -2193,9 +2154,8 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movq %rdx, %rax
 ; WIN64-NEXT:    mulq (%rcx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movq %rax, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi64_load:
@@ -2250,9 +2210,8 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movq %rdi, %rax
 ; SDAG-NEXT:    mulq (%rsi)
-; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movq %rax, (%rcx)
-; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi64_load2:
@@ -2270,9 +2229,8 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq (%rdx)
-; WIN64-NEXT:    seto %cl
 ; WIN64-NEXT:    movq %rax, (%r8)
-; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    seto %al
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi64_load2:

>From ccd00cf0935ae4d469737557d1b79d4484595842 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Wed, 21 Aug 2024 22:02:17 +0200
Subject: [PATCH 02/15] Only move instructions if a potential copy would be
 possible

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp   |   19 +-
 llvm/test/CodeGen/RISCV/llvm.frexp.ll         |   12 +-
 .../test/CodeGen/RISCV/overflow-intrinsics.ll |   10 +-
 llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll       |    3 +-
 .../CodeGen/RISCV/rv64-legal-i32/xaluo.ll     | 2603 -----------------
 .../rvv/fixed-vectors-deinterleave-load.ll    |    3 +-
 .../rvv/fixed-vectors-reduction-int-vp.ll     |   24 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |    3 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |    2 +-
 llvm/test/CodeGen/RISCV/xtheadmemidx.ll       |    3 +-
 10 files changed, 40 insertions(+), 2642 deletions(-)
 delete mode 100644 llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 493d7cd7d8c920..7c5bf9b5c3facd 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -121,7 +121,7 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs {
   }
 };
 
-static bool moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
+static std::optional<llvm::SmallVector<MachineInstr *>> moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
                                                SUnit *Src,
                                                ScheduleDAGMCP &DG) {
   MachineInstr *DstInstr = Dst->getInstr();
@@ -129,7 +129,7 @@ static bool moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
   MachineBasicBlock *MBB = SrcInstr->getParent();
 
   if (DstInstr == nullptr || SrcInstr == nullptr)
-    return false;
+    return {};
   assert("This function only operates on a basic block level." &&
          MBB == SrcInstr->getParent());
 
@@ -199,19 +199,20 @@ static bool moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
     const auto *Current = Edges.front();
     Edges.pop();
     if (!ProcessSNodeChildren(Edges, Current, false))
-      return false;
+      return {};
   }
 
   // If all of the dependencies were deemed valid during the BFS then we
   // are moving them before the copy source here keeping their relative
   // order to each other.
+  llvm::SmallVector<MachineInstr *> InstructionsToMove;
   auto CurrentInst = SrcInstr->getIterator();
   for (int I = 0; I < SectionSize; I++) {
     if (SectionInstr[I])
-      MBB->splice(SrcInstr->getIterator(), MBB, CurrentInst->getIterator());
+      InstructionsToMove.push_back(&(*CurrentInst));
     ++CurrentInst;
   }
-  return true;
+  return InstructionsToMove;
 }
 
 static std::optional<DestSourcePair> isCopyInstr(const MachineInstr &MI,
@@ -1161,6 +1162,7 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
   if (!Tracker.hasAnyCopies() && !Tracker.hasAnyInvalidCopies())
     return;
 
+  std::optional<llvm::SmallVector<MachineInstr *>> InstructionsToMove = {};
   for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx != OpEnd;
        ++OpIdx) {
     MachineOperand &MODef = MI.getOperand(OpIdx);
@@ -1202,7 +1204,8 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
       SUnit *DstSUnit = DG.getSUnit(Copy);
       SUnit *SrcSUnit = DG.getSUnit(&MI);
 
-      if (!moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit, DG))
+      InstructionsToMove = moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit, DG);
+      if (!InstructionsToMove)
         continue;
     }
 
@@ -1234,6 +1237,10 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
       MaybeDeadCopies.insert(Copy);
       Changed = true;
       ++NumCopyBackwardPropagated;
+    } else if (InstructionsToMove) {
+      for (auto *I : *InstructionsToMove) {
+        MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator());
+      }
     }
   }
 }
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index 23c885a1d2cb62..eed4e030f6720f 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -641,8 +641,8 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32IZFINXZDINX-NEXT:    mv s0, a4
 ; RV32IZFINXZDINX-NEXT:    mv s1, a3
 ; RV32IZFINXZDINX-NEXT:    mv s2, a2
-; RV32IZFINXZDINX-NEXT:    mv s3, a0
 ; RV32IZFINXZDINX-NEXT:    mv a2, a1
+; RV32IZFINXZDINX-NEXT:    mv s3, a0
 ; RV32IZFINXZDINX-NEXT:    addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT:    mv a0, a2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
@@ -691,8 +691,8 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64IZFINXZDINX-NEXT:    mv s0, a4
 ; RV64IZFINXZDINX-NEXT:    mv s1, a3
 ; RV64IZFINXZDINX-NEXT:    mv s2, a2
-; RV64IZFINXZDINX-NEXT:    mv s3, a0
 ; RV64IZFINXZDINX-NEXT:    mv a2, a1
+; RV64IZFINXZDINX-NEXT:    mv s3, a0
 ; RV64IZFINXZDINX-NEXT:    mv a1, sp
 ; RV64IZFINXZDINX-NEXT:    mv a0, a2
 ; RV64IZFINXZDINX-NEXT:    call frexpf
@@ -923,8 +923,8 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV32IZFINXZDINX-NEXT:    mv s0, a4
 ; RV32IZFINXZDINX-NEXT:    mv s1, a3
 ; RV32IZFINXZDINX-NEXT:    mv s2, a2
-; RV32IZFINXZDINX-NEXT:    mv s3, a0
 ; RV32IZFINXZDINX-NEXT:    mv a2, a1
+; RV32IZFINXZDINX-NEXT:    mv s3, a0
 ; RV32IZFINXZDINX-NEXT:    addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT:    mv a0, a2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
@@ -965,8 +965,8 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV64IZFINXZDINX-NEXT:    mv s0, a4
 ; RV64IZFINXZDINX-NEXT:    mv s1, a3
 ; RV64IZFINXZDINX-NEXT:    mv s2, a2
-; RV64IZFINXZDINX-NEXT:    mv s3, a0
 ; RV64IZFINXZDINX-NEXT:    mv a2, a1
+; RV64IZFINXZDINX-NEXT:    mv s3, a0
 ; RV64IZFINXZDINX-NEXT:    mv a1, sp
 ; RV64IZFINXZDINX-NEXT:    mv a0, a2
 ; RV64IZFINXZDINX-NEXT:    call frexpf
@@ -1171,8 +1171,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s0, a4
 ; RV32IZFINXZDINX-NEXT:    mv s1, a3
 ; RV32IZFINXZDINX-NEXT:    mv s2, a2
-; RV32IZFINXZDINX-NEXT:    mv s3, a0
 ; RV32IZFINXZDINX-NEXT:    mv a2, a1
+; RV32IZFINXZDINX-NEXT:    mv s3, a0
 ; RV32IZFINXZDINX-NEXT:    addi a1, sp, 12
 ; RV32IZFINXZDINX-NEXT:    mv a0, a2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
@@ -1212,8 +1212,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV64IZFINXZDINX-NEXT:    mv s0, a4
 ; RV64IZFINXZDINX-NEXT:    mv s1, a3
 ; RV64IZFINXZDINX-NEXT:    mv s2, a2
-; RV64IZFINXZDINX-NEXT:    mv s3, a0
 ; RV64IZFINXZDINX-NEXT:    mv a2, a1
+; RV64IZFINXZDINX-NEXT:    mv s3, a0
 ; RV64IZFINXZDINX-NEXT:    addi a1, sp, 8
 ; RV64IZFINXZDINX-NEXT:    mv a0, a2
 ; RV64IZFINXZDINX-NEXT:    call frexpf
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 67143336de477b..5efcb623aa209d 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -687,12 +687,11 @@ define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    or a3, a0, a1
 ; RV32-NEXT:    seqz a4, a0
-; RV32-NEXT:    sub a1, a1, a4
 ; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    snez a0, a3
+; RV32-NEXT:    sub a1, a1, a4
 ; RV32-NEXT:    sw a1, 4(a2)
-; RV32-NEXT:    mv a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: uaddo_i64_decrement_alt:
@@ -715,12 +714,11 @@ define i1 @uaddo_i64_decrement_alt_dom(i64 %x, ptr %p) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    or a3, a0, a1
 ; RV32-NEXT:    seqz a4, a0
-; RV32-NEXT:    sub a1, a1, a4
 ; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    snez a0, a3
+; RV32-NEXT:    sub a1, a1, a4
 ; RV32-NEXT:    sw a1, 4(a2)
-; RV32-NEXT:    mv a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: uaddo_i64_decrement_alt_dom:
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
index 5fe5a0eda46b84..c74cd48d75a654 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
@@ -156,9 +156,8 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    sll a0, a0, a2
 ; CHECK-NEXT:    srli a4, a4, 1
 ; CHECK-NEXT:    srl a1, a4, a6
-; CHECK-NEXT:    or a3, a5, a3
 ; CHECK-NEXT:    or a1, a0, a1
-; CHECK-NEXT:    mv a0, a3
+; CHECK-NEXT:    or a0, a5, a3
 ; CHECK-NEXT:    ret
   %or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b)
   ret i64 %or
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll
deleted file mode 100644
index 15aa11670e1263..00000000000000
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll
+++ /dev/null
@@ -1,2603 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m -verify-machineinstrs \
-; RUN:   -riscv-experimental-rv64-legal-i32 | FileCheck %s -check-prefix=RV64
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba -verify-machineinstrs \
-; RUN:   -riscv-experimental-rv64-legal-i32 | FileCheck %s --check-prefix=RV64ZBA
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zicond -verify-machineinstrs \
-; RUN:   -riscv-experimental-rv64-legal-i32 | FileCheck %s --check-prefix=RV64ZICOND
-
-;
-; Get the actual value of the overflow bit.
-;
-define zeroext i1 @saddo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
-; RV64-LABEL: saddo1.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addw a3, a0, a1
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    xor a3, a1, a3
-; RV64-NEXT:    snez a0, a3
-; RV64-NEXT:    sw a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo1.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addw a3, a0, a1
-; RV64ZBA-NEXT:    add a1, a0, a1
-; RV64ZBA-NEXT:    xor a3, a1, a3
-; RV64ZBA-NEXT:    snez a0, a3
-; RV64ZBA-NEXT:    sw a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo1.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addw a3, a0, a1
-; RV64ZICOND-NEXT:    add a1, a0, a1
-; RV64ZICOND-NEXT:    xor a3, a1, a3
-; RV64ZICOND-NEXT:    snez a0, a3
-; RV64ZICOND-NEXT:    sw a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-; Test the immediate version.
-define zeroext i1 @saddo2.i32(i32 signext %v1, ptr %res) {
-; RV64-LABEL: saddo2.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addiw a2, a0, 4
-; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo2.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addiw a2, a0, 4
-; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo2.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addiw a2, a0, 4
-; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 4)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-; Test negative immediates.
-define zeroext i1 @saddo3.i32(i32 signext %v1, ptr %res) {
-; RV64-LABEL: saddo3.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addiw a2, a0, -4
-; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    xori a0, a0, 1
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo3.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addiw a2, a0, -4
-; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    xori a0, a0, 1
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo3.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addiw a2, a0, -4
-; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    xori a0, a0, 1
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 -4)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-; Test immediates that are too large to be encoded.
-define zeroext i1 @saddo4.i32(i32 signext %v1, ptr %res) {
-; RV64-LABEL: saddo4.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    lui a2, 4096
-; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    addw a2, a0, a2
-; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo4.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    lui a2, 4096
-; RV64ZBA-NEXT:    addi a2, a2, -1
-; RV64ZBA-NEXT:    addw a2, a0, a2
-; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo4.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    lui a2, 4096
-; RV64ZICOND-NEXT:    addi a2, a2, -1
-; RV64ZICOND-NEXT:    addw a2, a0, a2
-; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 16777215)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
-; RV64-LABEL: saddo1.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a3, a0, a1
-; RV64-NEXT:    slt a0, a3, a0
-; RV64-NEXT:    slti a1, a1, 0
-; RV64-NEXT:    xor a0, a1, a0
-; RV64-NEXT:    sd a3, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo1.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a3, a0, a1
-; RV64ZBA-NEXT:    slt a0, a3, a0
-; RV64ZBA-NEXT:    slti a1, a1, 0
-; RV64ZBA-NEXT:    xor a0, a1, a0
-; RV64ZBA-NEXT:    sd a3, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo1.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a3, a0, a1
-; RV64ZICOND-NEXT:    slt a0, a3, a0
-; RV64ZICOND-NEXT:    slti a1, a1, 0
-; RV64ZICOND-NEXT:    xor a0, a1, a0
-; RV64ZICOND-NEXT:    sd a3, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
-; RV64-LABEL: saddo2.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a2, a0, 4
-; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    sd a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo2.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addi a2, a0, 4
-; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    sd a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo2.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addi a2, a0, 4
-; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    sd a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 4)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @saddo3.i64(i64 %v1, ptr %res) {
-; RV64-LABEL: saddo3.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a2, a0, -4
-; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    xori a0, a0, 1
-; RV64-NEXT:    sd a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo3.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addi a2, a0, -4
-; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    xori a0, a0, 1
-; RV64ZBA-NEXT:    sd a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo3.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addi a2, a0, -4
-; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    xori a0, a0, 1
-; RV64ZICOND-NEXT:    sd a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -4)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
-; RV64-LABEL: uaddo.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addw a1, a0, a1
-; RV64-NEXT:    sltu a0, a1, a0
-; RV64-NEXT:    sw a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addw a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a1, a0
-; RV64ZBA-NEXT:    sw a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addw a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a1, a0
-; RV64ZICOND-NEXT:    sw a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) {
-; RV64-LABEL: uaddo.i32.constant:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addiw a2, a0, -2
-; RV64-NEXT:    sltu a0, a2, a0
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.i32.constant:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addiw a2, a0, -2
-; RV64ZBA-NEXT:    sltu a0, a2, a0
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.i32.constant:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addiw a2, a0, -2
-; RV64ZICOND-NEXT:    sltu a0, a2, a0
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 -2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @uaddo.i32.constant_one(i32 signext %v1, ptr %res) {
-; RV64-LABEL: uaddo.i32.constant_one:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addiw a2, a0, 1
-; RV64-NEXT:    seqz a0, a2
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.i32.constant_one:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addiw a2, a0, 1
-; RV64ZBA-NEXT:    seqz a0, a2
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.i32.constant_one:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addiw a2, a0, 1
-; RV64ZICOND-NEXT:    seqz a0, a2
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
-; RV64-LABEL: uaddo.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    sltu a0, a1, a0
-; RV64-NEXT:    sd a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a1, a0
-; RV64ZBA-NEXT:    sd a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a1, a0
-; RV64ZICOND-NEXT:    sd a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @uaddo.i64.constant_one(i64 %v1, ptr %res) {
-; RV64-LABEL: uaddo.i64.constant_one:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a2, a0, 1
-; RV64-NEXT:    seqz a0, a2
-; RV64-NEXT:    sd a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.i64.constant_one:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addi a2, a0, 1
-; RV64ZBA-NEXT:    seqz a0, a2
-; RV64ZBA-NEXT:    sd a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.i64.constant_one:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addi a2, a0, 1
-; RV64ZICOND-NEXT:    seqz a0, a2
-; RV64ZICOND-NEXT:    sd a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
-; RV64-LABEL: ssubo1.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    subw a3, a0, a1
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    xor a3, a1, a3
-; RV64-NEXT:    snez a0, a3
-; RV64-NEXT:    sw a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: ssubo1.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    subw a3, a0, a1
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    xor a3, a1, a3
-; RV64ZBA-NEXT:    snez a0, a3
-; RV64ZBA-NEXT:    sw a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: ssubo1.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    subw a3, a0, a1
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    xor a3, a1, a3
-; RV64ZICOND-NEXT:    snez a0, a3
-; RV64ZICOND-NEXT:    sw a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @ssubo2.i32(i32 signext %v1, ptr %res) {
-; RV64-LABEL: ssubo2.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addiw a2, a0, 4
-; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: ssubo2.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addiw a2, a0, 4
-; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: ssubo2.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addiw a2, a0, 4
-; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 -4)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
-; RV64-LABEL: ssubo.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sgtz a3, a1
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    slt a0, a1, a0
-; RV64-NEXT:    xor a0, a3, a0
-; RV64-NEXT:    sd a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: ssubo.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sgtz a3, a1
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    slt a0, a1, a0
-; RV64ZBA-NEXT:    xor a0, a3, a0
-; RV64ZBA-NEXT:    sd a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: ssubo.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sgtz a3, a1
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    slt a0, a1, a0
-; RV64ZICOND-NEXT:    xor a0, a3, a0
-; RV64ZICOND-NEXT:    sd a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
-; RV64-LABEL: usubo.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    subw a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    sw a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    subw a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a0, a1
-; RV64ZBA-NEXT:    sw a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    subw a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a0, a1
-; RV64ZICOND-NEXT:    sw a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
-; RV64-LABEL: usubo.i32.constant.rhs:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addiw a2, a0, 2
-; RV64-NEXT:    sltu a0, a0, a2
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.i32.constant.rhs:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addiw a2, a0, 2
-; RV64ZBA-NEXT:    sltu a0, a0, a2
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.i32.constant.rhs:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addiw a2, a0, 2
-; RV64ZICOND-NEXT:    sltu a0, a0, a2
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 -2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
-; RV64-LABEL: usubo.i32.constant.lhs:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    li a2, -2
-; RV64-NEXT:    subw a2, a2, a0
-; RV64-NEXT:    addi a0, a2, 1
-; RV64-NEXT:    seqz a0, a0
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.i32.constant.lhs:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    li a2, -2
-; RV64ZBA-NEXT:    subw a2, a2, a0
-; RV64ZBA-NEXT:    addi a0, a2, 1
-; RV64ZBA-NEXT:    seqz a0, a0
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.i32.constant.lhs:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    li a2, -2
-; RV64ZICOND-NEXT:    subw a2, a2, a0
-; RV64ZICOND-NEXT:    addi a0, a2, 1
-; RV64ZICOND-NEXT:    seqz a0, a0
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 -2, i32 %v1)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
-; RV64-LABEL: usubo.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    sd a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a0, a1
-; RV64ZBA-NEXT:    sd a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a0, a1
-; RV64ZICOND-NEXT:    sd a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @smulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
-; RV64-LABEL: smulo.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulw a3, a0, a1
-; RV64-NEXT:    mul a1, a0, a1
-; RV64-NEXT:    xor a3, a1, a3
-; RV64-NEXT:    snez a0, a3
-; RV64-NEXT:    sw a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulw a3, a0, a1
-; RV64ZBA-NEXT:    mul a1, a0, a1
-; RV64ZBA-NEXT:    xor a3, a1, a3
-; RV64ZBA-NEXT:    snez a0, a3
-; RV64ZBA-NEXT:    sw a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulw a3, a0, a1
-; RV64ZICOND-NEXT:    mul a1, a0, a1
-; RV64ZICOND-NEXT:    xor a3, a1, a3
-; RV64ZICOND-NEXT:    snez a0, a3
-; RV64ZICOND-NEXT:    sw a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @smulo2.i32(i32 signext %v1, ptr %res) {
-; RV64-LABEL: smulo2.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    li a2, 13
-; RV64-NEXT:    mulw a3, a0, a2
-; RV64-NEXT:    mul a2, a0, a2
-; RV64-NEXT:    xor a3, a2, a3
-; RV64-NEXT:    snez a0, a3
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo2.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sh1add a2, a0, a0
-; RV64ZBA-NEXT:    sh2add a2, a2, a0
-; RV64ZBA-NEXT:    sext.w a0, a2
-; RV64ZBA-NEXT:    xor a0, a2, a0
-; RV64ZBA-NEXT:    snez a0, a0
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo2.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    li a2, 13
-; RV64ZICOND-NEXT:    mulw a3, a0, a2
-; RV64ZICOND-NEXT:    mul a2, a0, a2
-; RV64ZICOND-NEXT:    xor a3, a2, a3
-; RV64ZICOND-NEXT:    snez a0, a3
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 13)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
-; RV64-LABEL: smulo.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulh a3, a0, a1
-; RV64-NEXT:    mul a1, a0, a1
-; RV64-NEXT:    srai a0, a1, 63
-; RV64-NEXT:    xor a0, a3, a0
-; RV64-NEXT:    snez a0, a0
-; RV64-NEXT:    sd a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulh a3, a0, a1
-; RV64ZBA-NEXT:    mul a1, a0, a1
-; RV64ZBA-NEXT:    srai a0, a1, 63
-; RV64ZBA-NEXT:    xor a0, a3, a0
-; RV64ZBA-NEXT:    snez a0, a0
-; RV64ZBA-NEXT:    sd a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulh a3, a0, a1
-; RV64ZICOND-NEXT:    mul a1, a0, a1
-; RV64ZICOND-NEXT:    srai a0, a1, 63
-; RV64ZICOND-NEXT:    xor a0, a3, a0
-; RV64ZICOND-NEXT:    snez a0, a0
-; RV64ZICOND-NEXT:    sd a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
-; RV64-LABEL: smulo2.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    li a2, 13
-; RV64-NEXT:    mulh a3, a0, a2
-; RV64-NEXT:    mul a2, a0, a2
-; RV64-NEXT:    srai a0, a2, 63
-; RV64-NEXT:    xor a0, a3, a0
-; RV64-NEXT:    snez a0, a0
-; RV64-NEXT:    sd a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo2.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    li a2, 13
-; RV64ZBA-NEXT:    mulh a2, a0, a2
-; RV64ZBA-NEXT:    sh1add a3, a0, a0
-; RV64ZBA-NEXT:    sh2add a3, a3, a0
-; RV64ZBA-NEXT:    srai a0, a3, 63
-; RV64ZBA-NEXT:    xor a0, a2, a0
-; RV64ZBA-NEXT:    snez a0, a0
-; RV64ZBA-NEXT:    sd a3, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo2.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    li a2, 13
-; RV64ZICOND-NEXT:    mulh a3, a0, a2
-; RV64ZICOND-NEXT:    mul a2, a0, a2
-; RV64ZICOND-NEXT:    srai a0, a2, 63
-; RV64ZICOND-NEXT:    xor a0, a3, a0
-; RV64ZICOND-NEXT:    snez a0, a0
-; RV64ZICOND-NEXT:    sd a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 13)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
-; RV64-LABEL: umulo.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    slli a1, a1, 32
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    mulhu a1, a0, a1
-; RV64-NEXT:    srai a0, a1, 32
-; RV64-NEXT:    snez a0, a0
-; RV64-NEXT:    sw a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    zext.w a1, a1
-; RV64ZBA-NEXT:    zext.w a0, a0
-; RV64ZBA-NEXT:    mul a1, a0, a1
-; RV64ZBA-NEXT:    srai a0, a1, 32
-; RV64ZBA-NEXT:    snez a0, a0
-; RV64ZBA-NEXT:    sw a1, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    slli a1, a1, 32
-; RV64ZICOND-NEXT:    slli a0, a0, 32
-; RV64ZICOND-NEXT:    mulhu a1, a0, a1
-; RV64ZICOND-NEXT:    srai a0, a1, 32
-; RV64ZICOND-NEXT:    snez a0, a0
-; RV64ZICOND-NEXT:    sw a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
-; RV64-LABEL: umulo2.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    li a2, 13
-; RV64-NEXT:    slli a2, a2, 32
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    mulhu a2, a0, a2
-; RV64-NEXT:    srli a0, a2, 32
-; RV64-NEXT:    snez a0, a0
-; RV64-NEXT:    sw a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo2.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    zext.w a2, a0
-; RV64ZBA-NEXT:    sh1add.uw a0, a0, a2
-; RV64ZBA-NEXT:    sh2add a2, a0, a2
-; RV64ZBA-NEXT:    srli a0, a2, 32
-; RV64ZBA-NEXT:    snez a0, a0
-; RV64ZBA-NEXT:    sw a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo2.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    li a2, 13
-; RV64ZICOND-NEXT:    slli a2, a2, 32
-; RV64ZICOND-NEXT:    slli a0, a0, 32
-; RV64ZICOND-NEXT:    mulhu a2, a0, a2
-; RV64ZICOND-NEXT:    srli a0, a2, 32
-; RV64ZICOND-NEXT:    snez a0, a0
-; RV64ZICOND-NEXT:    sw a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 13)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, ptr %res
-  ret i1 %obit
-}
-
-; Similar to umulo.i32, but storing the overflow and returning the result.
-define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) {
-; RV64-LABEL: umulo3.i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    slli a1, a1, 32
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    mulhu a0, a0, a1
-; RV64-NEXT:    srai a1, a0, 32
-; RV64-NEXT:    snez a1, a1
-; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    sw a1, 0(a2)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo3.i32:
-; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    zext.w a1, a1
-; RV64ZBA-NEXT:    zext.w a0, a0
-; RV64ZBA-NEXT:    mul a3, a0, a1
-; RV64ZBA-NEXT:    srai a3, a3, 32
-; RV64ZBA-NEXT:    snez a3, a3
-; RV64ZBA-NEXT:    mulw a0, a0, a1
-; RV64ZBA-NEXT:    sw a3, 0(a2)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo3.i32:
-; RV64ZICOND:       # %bb.0:
-; RV64ZICOND-NEXT:    slli a1, a1, 32
-; RV64ZICOND-NEXT:    slli a0, a0, 32
-; RV64ZICOND-NEXT:    mulhu a0, a0, a1
-; RV64ZICOND-NEXT:    srai a1, a0, 32
-; RV64ZICOND-NEXT:    snez a1, a1
-; RV64ZICOND-NEXT:    sext.w a0, a0
-; RV64ZICOND-NEXT:    sw a1, 0(a2)
-; RV64ZICOND-NEXT:    ret
-  %4 = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %0, i32 %1)
-  %5 = extractvalue { i32, i1 } %4, 1
-  %6 = extractvalue { i32, i1 } %4, 0
-  %7 = zext i1 %5 to i32
-  store i32 %7, ptr %2, align 4
-  ret i32 %6
-}
-
-define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
-; RV64-LABEL: umulo.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulhu a3, a0, a1
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    sd a0, 0(a2)
-; RV64-NEXT:    snez a0, a3
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulhu a3, a0, a1
-; RV64ZBA-NEXT:    mul a0, a0, a1
-; RV64ZBA-NEXT:    sd a0, 0(a2)
-; RV64ZBA-NEXT:    snez a0, a3
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulhu a3, a0, a1
-; RV64ZICOND-NEXT:    mul a0, a0, a1
-; RV64ZICOND-NEXT:    sd a0, 0(a2)
-; RV64ZICOND-NEXT:    snez a0, a3
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
-; RV64-LABEL: umulo2.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    li a3, 13
-; RV64-NEXT:    mulhu a2, a0, a3
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    sd a0, 0(a1)
-; RV64-NEXT:    snez a0, a2
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo2.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    li a2, 13
-; RV64ZBA-NEXT:    mulhu a2, a0, a2
-; RV64ZBA-NEXT:    sh1add a3, a0, a0
-; RV64ZBA-NEXT:    sh2add a0, a3, a0
-; RV64ZBA-NEXT:    sd a0, 0(a1)
-; RV64ZBA-NEXT:    snez a0, a2
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo2.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    li a3, 13
-; RV64ZICOND-NEXT:    mulhu a2, a0, a3
-; RV64ZICOND-NEXT:    mul a0, a0, a3
-; RV64ZICOND-NEXT:    sd a0, 0(a1)
-; RV64ZICOND-NEXT:    snez a0, a2
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 13)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-
-;
-; Check the use of the overflow bit in combination with a select instruction.
-;
-define i32 @saddo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: saddo.select.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addw a2, a0, a1
-; RV64-NEXT:    add a3, a0, a1
-; RV64-NEXT:    bne a3, a2, .LBB28_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB28_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo.select.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addw a2, a0, a1
-; RV64ZBA-NEXT:    add a3, a0, a1
-; RV64ZBA-NEXT:    bne a3, a2, .LBB28_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB28_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo.select.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addw a2, a0, a1
-; RV64ZICOND-NEXT:    add a3, a0, a1
-; RV64ZICOND-NEXT:    xor a2, a3, a2
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = select i1 %obit, i32 %v1, i32 %v2
-  ret i32 %ret
-}
-
-define i1 @saddo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: saddo.not.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addw a2, a0, a1
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    seqz a0, a0
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo.not.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addw a2, a0, a1
-; RV64ZBA-NEXT:    add a0, a0, a1
-; RV64ZBA-NEXT:    xor a0, a0, a2
-; RV64ZBA-NEXT:    seqz a0, a0
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo.not.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addw a2, a0, a1
-; RV64ZICOND-NEXT:    add a0, a0, a1
-; RV64ZICOND-NEXT:    xor a0, a0, a2
-; RV64ZICOND-NEXT:    seqz a0, a0
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: saddo.select.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    slt a2, a2, a0
-; RV64-NEXT:    slti a3, a1, 0
-; RV64-NEXT:    bne a3, a2, .LBB30_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB30_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo.select.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a2, a0, a1
-; RV64ZBA-NEXT:    slt a2, a2, a0
-; RV64ZBA-NEXT:    slti a3, a1, 0
-; RV64ZBA-NEXT:    bne a3, a2, .LBB30_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB30_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo.select.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a2, a0, a1
-; RV64ZICOND-NEXT:    slt a2, a2, a0
-; RV64ZICOND-NEXT:    slti a3, a1, 0
-; RV64ZICOND-NEXT:    xor a2, a3, a2
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = select i1 %obit, i64 %v1, i64 %v2
-  ret i64 %ret
-}
-
-define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: saddo.not.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    slti a1, a1, 0
-; RV64-NEXT:    xor a0, a1, a0
-; RV64-NEXT:    xori a0, a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo.not.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a2, a0, a1
-; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    slti a1, a1, 0
-; RV64ZBA-NEXT:    xor a0, a1, a0
-; RV64ZBA-NEXT:    xori a0, a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo.not.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a2, a0, a1
-; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    slti a1, a1, 0
-; RV64ZICOND-NEXT:    xor a0, a1, a0
-; RV64ZICOND-NEXT:    xori a0, a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: uaddo.select.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addw a2, a0, a1
-; RV64-NEXT:    bltu a2, a0, .LBB32_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB32_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.select.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addw a2, a0, a1
-; RV64ZBA-NEXT:    bltu a2, a0, .LBB32_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB32_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.select.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addw a2, a0, a1
-; RV64ZICOND-NEXT:    sltu a2, a2, a0
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = select i1 %obit, i32 %v1, i32 %v2
-  ret i32 %ret
-}
-
-define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: uaddo.not.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addw a1, a0, a1
-; RV64-NEXT:    sltu a0, a1, a0
-; RV64-NEXT:    xori a0, a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.not.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addw a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a1, a0
-; RV64ZBA-NEXT:    xori a0, a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.not.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addw a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a1, a0
-; RV64ZICOND-NEXT:    xori a0, a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: uaddo.select.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    bltu a2, a0, .LBB34_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB34_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.select.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a2, a0, a1
-; RV64ZBA-NEXT:    bltu a2, a0, .LBB34_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB34_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.select.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a2, a0, a1
-; RV64ZICOND-NEXT:    sltu a2, a2, a0
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = select i1 %obit, i64 %v1, i64 %v2
-  ret i64 %ret
-}
-
-define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: uaddo.not.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    sltu a0, a1, a0
-; RV64-NEXT:    xori a0, a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.not.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a1, a0
-; RV64ZBA-NEXT:    xori a0, a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.not.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a1, a0
-; RV64ZICOND-NEXT:    xori a0, a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: ssubo.select.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    subw a2, a0, a1
-; RV64-NEXT:    sub a3, a0, a1
-; RV64-NEXT:    bne a3, a2, .LBB36_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB36_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: ssubo.select.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    subw a2, a0, a1
-; RV64ZBA-NEXT:    sub a3, a0, a1
-; RV64ZBA-NEXT:    bne a3, a2, .LBB36_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB36_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: ssubo.select.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    subw a2, a0, a1
-; RV64ZICOND-NEXT:    sub a3, a0, a1
-; RV64ZICOND-NEXT:    xor a2, a3, a2
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = select i1 %obit, i32 %v1, i32 %v2
-  ret i32 %ret
-}
-
-define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: ssubo.not.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    subw a2, a0, a1
-; RV64-NEXT:    sub a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    seqz a0, a0
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: ssubo.not.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    subw a2, a0, a1
-; RV64ZBA-NEXT:    sub a0, a0, a1
-; RV64ZBA-NEXT:    xor a0, a0, a2
-; RV64ZBA-NEXT:    seqz a0, a0
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: ssubo.not.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    subw a2, a0, a1
-; RV64ZICOND-NEXT:    sub a0, a0, a1
-; RV64ZICOND-NEXT:    xor a0, a0, a2
-; RV64ZICOND-NEXT:    seqz a0, a0
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: ssubo.select.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sgtz a2, a1
-; RV64-NEXT:    sub a3, a0, a1
-; RV64-NEXT:    slt a3, a3, a0
-; RV64-NEXT:    bne a2, a3, .LBB38_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB38_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: ssubo.select.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sgtz a2, a1
-; RV64ZBA-NEXT:    sub a3, a0, a1
-; RV64ZBA-NEXT:    slt a3, a3, a0
-; RV64ZBA-NEXT:    bne a2, a3, .LBB38_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB38_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: ssubo.select.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sgtz a2, a1
-; RV64ZICOND-NEXT:    sub a3, a0, a1
-; RV64ZICOND-NEXT:    slt a3, a3, a0
-; RV64ZICOND-NEXT:    xor a2, a2, a3
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = select i1 %obit, i64 %v1, i64 %v2
-  ret i64 %ret
-}
-
-define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: ssub.not.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sgtz a2, a1
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    slt a0, a1, a0
-; RV64-NEXT:    xor a0, a2, a0
-; RV64-NEXT:    xori a0, a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: ssub.not.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sgtz a2, a1
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    slt a0, a1, a0
-; RV64ZBA-NEXT:    xor a0, a2, a0
-; RV64ZBA-NEXT:    xori a0, a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: ssub.not.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sgtz a2, a1
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    slt a0, a1, a0
-; RV64ZICOND-NEXT:    xor a0, a2, a0
-; RV64ZICOND-NEXT:    xori a0, a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: usubo.select.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    subw a2, a0, a1
-; RV64-NEXT:    bltu a0, a2, .LBB40_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB40_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.select.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    subw a2, a0, a1
-; RV64ZBA-NEXT:    bltu a0, a2, .LBB40_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB40_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.select.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    subw a2, a0, a1
-; RV64ZICOND-NEXT:    sltu a2, a0, a2
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = select i1 %obit, i32 %v1, i32 %v2
-  ret i32 %ret
-}
-
-define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: usubo.not.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    subw a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    xori a0, a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.not.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    subw a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a0, a1
-; RV64ZBA-NEXT:    xori a0, a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.not.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    subw a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a0, a1
-; RV64ZICOND-NEXT:    xori a0, a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: usubo.select.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    bltu a0, a2, .LBB42_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB42_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.select.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a2, a0, a1
-; RV64ZBA-NEXT:    bltu a0, a2, .LBB42_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB42_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.select.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a2, a0, a1
-; RV64ZICOND-NEXT:    sltu a2, a0, a2
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = select i1 %obit, i64 %v1, i64 %v2
-  ret i64 %ret
-}
-
-define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: usubo.not.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    xori a0, a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.not.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a0, a1
-; RV64ZBA-NEXT:    xori a0, a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.not.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a0, a1
-; RV64ZICOND-NEXT:    xori a0, a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i32 @smulo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: smulo.select.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulw a2, a0, a1
-; RV64-NEXT:    mul a3, a0, a1
-; RV64-NEXT:    bne a3, a2, .LBB44_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB44_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo.select.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulw a2, a0, a1
-; RV64ZBA-NEXT:    mul a3, a0, a1
-; RV64ZBA-NEXT:    bne a3, a2, .LBB44_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB44_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo.select.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulw a2, a0, a1
-; RV64ZICOND-NEXT:    mul a3, a0, a1
-; RV64ZICOND-NEXT:    xor a2, a3, a2
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = select i1 %obit, i32 %v1, i32 %v2
-  ret i32 %ret
-}
-
-define i1 @smulo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: smulo.not.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulw a2, a0, a1
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    seqz a0, a0
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo.not.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulw a2, a0, a1
-; RV64ZBA-NEXT:    mul a0, a0, a1
-; RV64ZBA-NEXT:    xor a0, a0, a2
-; RV64ZBA-NEXT:    seqz a0, a0
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo.not.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulw a2, a0, a1
-; RV64ZICOND-NEXT:    mul a0, a0, a1
-; RV64ZICOND-NEXT:    xor a0, a0, a2
-; RV64ZICOND-NEXT:    seqz a0, a0
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: smulo.select.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulh a2, a0, a1
-; RV64-NEXT:    mul a3, a0, a1
-; RV64-NEXT:    srai a3, a3, 63
-; RV64-NEXT:    bne a2, a3, .LBB46_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB46_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo.select.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulh a2, a0, a1
-; RV64ZBA-NEXT:    mul a3, a0, a1
-; RV64ZBA-NEXT:    srai a3, a3, 63
-; RV64ZBA-NEXT:    bne a2, a3, .LBB46_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB46_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo.select.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulh a2, a0, a1
-; RV64ZICOND-NEXT:    mul a3, a0, a1
-; RV64ZICOND-NEXT:    srai a3, a3, 63
-; RV64ZICOND-NEXT:    xor a2, a2, a3
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = select i1 %obit, i64 %v1, i64 %v2
-  ret i64 %ret
-}
-
-define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: smulo.not.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulh a2, a0, a1
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srai a0, a0, 63
-; RV64-NEXT:    xor a0, a2, a0
-; RV64-NEXT:    seqz a0, a0
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo.not.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulh a2, a0, a1
-; RV64ZBA-NEXT:    mul a0, a0, a1
-; RV64ZBA-NEXT:    srai a0, a0, 63
-; RV64ZBA-NEXT:    xor a0, a2, a0
-; RV64ZBA-NEXT:    seqz a0, a0
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo.not.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulh a2, a0, a1
-; RV64ZICOND-NEXT:    mul a0, a0, a1
-; RV64ZICOND-NEXT:    srai a0, a0, 63
-; RV64ZICOND-NEXT:    xor a0, a2, a0
-; RV64ZICOND-NEXT:    seqz a0, a0
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i32 @umulo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: umulo.select.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    slli a3, a0, 32
-; RV64-NEXT:    mulhu a2, a3, a2
-; RV64-NEXT:    srai a2, a2, 32
-; RV64-NEXT:    bnez a2, .LBB48_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB48_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo.select.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    zext.w a2, a1
-; RV64ZBA-NEXT:    zext.w a3, a0
-; RV64ZBA-NEXT:    mul a2, a3, a2
-; RV64ZBA-NEXT:    srai a2, a2, 32
-; RV64ZBA-NEXT:    bnez a2, .LBB48_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB48_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo.select.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    slli a2, a1, 32
-; RV64ZICOND-NEXT:    slli a3, a0, 32
-; RV64ZICOND-NEXT:    mulhu a2, a3, a2
-; RV64ZICOND-NEXT:    srai a2, a2, 32
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = select i1 %obit, i32 %v1, i32 %v2
-  ret i32 %ret
-}
-
-define i1 @umulo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: umulo.not.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    slli a1, a1, 32
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    mulhu a0, a0, a1
-; RV64-NEXT:    srai a0, a0, 32
-; RV64-NEXT:    seqz a0, a0
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo.not.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    zext.w a1, a1
-; RV64ZBA-NEXT:    zext.w a0, a0
-; RV64ZBA-NEXT:    mul a0, a0, a1
-; RV64ZBA-NEXT:    srai a0, a0, 32
-; RV64ZBA-NEXT:    seqz a0, a0
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo.not.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    slli a1, a1, 32
-; RV64ZICOND-NEXT:    slli a0, a0, 32
-; RV64ZICOND-NEXT:    mulhu a0, a0, a1
-; RV64ZICOND-NEXT:    srai a0, a0, 32
-; RV64ZICOND-NEXT:    seqz a0, a0
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: umulo.select.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulhu a2, a0, a1
-; RV64-NEXT:    bnez a2, .LBB50_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB50_2: # %entry
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo.select.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulhu a2, a0, a1
-; RV64ZBA-NEXT:    bnez a2, .LBB50_2
-; RV64ZBA-NEXT:  # %bb.1: # %entry
-; RV64ZBA-NEXT:    mv a0, a1
-; RV64ZBA-NEXT:  .LBB50_2: # %entry
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo.select.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulhu a2, a0, a1
-; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
-; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT:    or a0, a0, a1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = select i1 %obit, i64 %v1, i64 %v2
-  ret i64 %ret
-}
-
-define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: umulo.not.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulhu a0, a0, a1
-; RV64-NEXT:    seqz a0, a0
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo.not.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulhu a0, a0, a1
-; RV64ZBA-NEXT:    seqz a0, a0
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo.not.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulhu a0, a0, a1
-; RV64ZICOND-NEXT:    seqz a0, a0
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = xor i1 %obit, true
-  ret i1 %ret
-}
-
-
-;
-; Check the use of the overflow bit in combination with a branch instruction.
-;
-define zeroext i1 @saddo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: saddo.br.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addw a2, a0, a1
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    beq a0, a2, .LBB52_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB52_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo.br.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addw a2, a0, a1
-; RV64ZBA-NEXT:    add a0, a0, a1
-; RV64ZBA-NEXT:    beq a0, a2, .LBB52_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB52_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo.br.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addw a2, a0, a1
-; RV64ZICOND-NEXT:    add a0, a0, a1
-; RV64ZICOND-NEXT:    beq a0, a2, .LBB52_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB52_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: saddo.br.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    slti a1, a1, 0
-; RV64-NEXT:    beq a1, a0, .LBB53_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB53_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: saddo.br.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a2, a0, a1
-; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    slti a1, a1, 0
-; RV64ZBA-NEXT:    beq a1, a0, .LBB53_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB53_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: saddo.br.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a2, a0, a1
-; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    slti a1, a1, 0
-; RV64ZICOND-NEXT:    beq a1, a0, .LBB53_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB53_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
-; RV64-LABEL: uaddo.br.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addw a1, a0, a1
-; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    bgeu a1, a0, .LBB54_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB54_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.br.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addw a1, a0, a1
-; RV64ZBA-NEXT:    sext.w a0, a0
-; RV64ZBA-NEXT:    bgeu a1, a0, .LBB54_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB54_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.br.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addw a1, a0, a1
-; RV64ZICOND-NEXT:    sext.w a0, a0
-; RV64ZICOND-NEXT:    bgeu a1, a0, .LBB54_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB54_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: uaddo.br.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    bgeu a1, a0, .LBB55_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB55_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.br.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a1, a0, a1
-; RV64ZBA-NEXT:    bgeu a1, a0, .LBB55_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB55_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.br.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a1, a0, a1
-; RV64ZICOND-NEXT:    bgeu a1, a0, .LBB55_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB55_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: ssubo.br.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    subw a2, a0, a1
-; RV64-NEXT:    sub a0, a0, a1
-; RV64-NEXT:    beq a0, a2, .LBB56_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB56_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: ssubo.br.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    subw a2, a0, a1
-; RV64ZBA-NEXT:    sub a0, a0, a1
-; RV64ZBA-NEXT:    beq a0, a2, .LBB56_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB56_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: ssubo.br.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    subw a2, a0, a1
-; RV64ZICOND-NEXT:    sub a0, a0, a1
-; RV64ZICOND-NEXT:    beq a0, a2, .LBB56_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB56_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: ssubo.br.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sgtz a2, a1
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    slt a0, a1, a0
-; RV64-NEXT:    beq a2, a0, .LBB57_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB57_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: ssubo.br.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sgtz a2, a1
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    slt a0, a1, a0
-; RV64ZBA-NEXT:    beq a2, a0, .LBB57_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB57_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: ssubo.br.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sgtz a2, a1
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    slt a0, a1, a0
-; RV64ZICOND-NEXT:    beq a2, a0, .LBB57_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB57_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: usubo.br.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    subw a1, a0, a1
-; RV64-NEXT:    bgeu a0, a1, .LBB58_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB58_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.br.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    subw a1, a0, a1
-; RV64ZBA-NEXT:    bgeu a0, a1, .LBB58_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB58_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.br.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    subw a1, a0, a1
-; RV64ZICOND-NEXT:    bgeu a0, a1, .LBB58_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB58_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: usubo.br.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    bgeu a0, a1, .LBB59_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB59_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: usubo.br.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    bgeu a0, a1, .LBB59_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB59_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: usubo.br.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    bgeu a0, a1, .LBB59_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB59_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @smulo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: smulo.br.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulw a2, a0, a1
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    beq a0, a2, .LBB60_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB60_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo.br.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulw a2, a0, a1
-; RV64ZBA-NEXT:    mul a0, a0, a1
-; RV64ZBA-NEXT:    beq a0, a2, .LBB60_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB60_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo.br.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulw a2, a0, a1
-; RV64ZICOND-NEXT:    mul a0, a0, a1
-; RV64ZICOND-NEXT:    beq a0, a2, .LBB60_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB60_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: smulo.br.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulh a2, a0, a1
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srai a0, a0, 63
-; RV64-NEXT:    beq a2, a0, .LBB61_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB61_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo.br.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulh a2, a0, a1
-; RV64ZBA-NEXT:    mul a0, a0, a1
-; RV64ZBA-NEXT:    srai a0, a0, 63
-; RV64ZBA-NEXT:    beq a2, a0, .LBB61_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB61_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo.br.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulh a2, a0, a1
-; RV64ZICOND-NEXT:    mul a0, a0, a1
-; RV64ZICOND-NEXT:    srai a0, a0, 63
-; RV64ZICOND-NEXT:    beq a2, a0, .LBB61_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB61_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @smulo2.br.i64(i64 %v1) {
-; RV64-LABEL: smulo2.br.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    li a1, -13
-; RV64-NEXT:    mulh a2, a0, a1
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srai a0, a0, 63
-; RV64-NEXT:    beq a2, a0, .LBB62_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB62_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: smulo2.br.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    li a1, -13
-; RV64ZBA-NEXT:    mulh a2, a0, a1
-; RV64ZBA-NEXT:    mul a0, a0, a1
-; RV64ZBA-NEXT:    srai a0, a0, 63
-; RV64ZBA-NEXT:    beq a2, a0, .LBB62_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB62_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: smulo2.br.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    li a1, -13
-; RV64ZICOND-NEXT:    mulh a2, a0, a1
-; RV64ZICOND-NEXT:    mul a0, a0, a1
-; RV64ZICOND-NEXT:    srai a0, a0, 63
-; RV64ZICOND-NEXT:    beq a2, a0, .LBB62_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB62_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 -13)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @umulo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: umulo.br.i32:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    slli a1, a1, 32
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    mulhu a0, a0, a1
-; RV64-NEXT:    srai a0, a0, 32
-; RV64-NEXT:    beqz a0, .LBB63_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB63_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo.br.i32:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    zext.w a1, a1
-; RV64ZBA-NEXT:    zext.w a0, a0
-; RV64ZBA-NEXT:    mul a0, a0, a1
-; RV64ZBA-NEXT:    srai a0, a0, 32
-; RV64ZBA-NEXT:    beqz a0, .LBB63_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB63_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo.br.i32:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    slli a1, a1, 32
-; RV64ZICOND-NEXT:    slli a0, a0, 32
-; RV64ZICOND-NEXT:    mulhu a0, a0, a1
-; RV64ZICOND-NEXT:    srai a0, a0, 32
-; RV64ZICOND-NEXT:    beqz a0, .LBB63_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB63_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: umulo.br.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulhu a0, a0, a1
-; RV64-NEXT:    beqz a0, .LBB64_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB64_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo.br.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    mulhu a0, a0, a1
-; RV64ZBA-NEXT:    beqz a0, .LBB64_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB64_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo.br.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    mulhu a0, a0, a1
-; RV64ZICOND-NEXT:    beqz a0, .LBB64_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB64_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @umulo2.br.i64(i64 %v1) {
-; RV64-LABEL: umulo2.br.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a1, a0, a0
-; RV64-NEXT:    bgeu a1, a0, .LBB65_2
-; RV64-NEXT:  # %bb.1: # %overflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB65_2: # %continue
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: umulo2.br.i64:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a1, a0, a0
-; RV64ZBA-NEXT:    bgeu a1, a0, .LBB65_2
-; RV64ZBA-NEXT:  # %bb.1: # %overflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:    ret
-; RV64ZBA-NEXT:  .LBB65_2: # %continue
-; RV64ZBA-NEXT:    li a0, 1
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: umulo2.br.i64:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a1, a0, a0
-; RV64ZICOND-NEXT:    bgeu a1, a0, .LBB65_2
-; RV64ZICOND-NEXT:  # %bb.1: # %overflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:    ret
-; RV64ZICOND-NEXT:  .LBB65_2: # %continue
-; RV64ZICOND-NEXT:    li a0, 1
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) {
-; RV64-LABEL: uaddo.i64.constant:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a2, a0, 2
-; RV64-NEXT:    sltu a0, a2, a0
-; RV64-NEXT:    sd a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.i64.constant:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addi a2, a0, 2
-; RV64ZBA-NEXT:    sltu a0, a2, a0
-; RV64ZBA-NEXT:    sd a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.i64.constant:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addi a2, a0, 2
-; RV64ZICOND-NEXT:    sltu a0, a2, a0
-; RV64ZICOND-NEXT:    sd a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) {
-; RV64-LABEL: uaddo.i64.constant_2048:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a2, a0, 2047
-; RV64-NEXT:    addi a2, a2, 1
-; RV64-NEXT:    sltu a0, a2, a0
-; RV64-NEXT:    sd a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.i64.constant_2048:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addi a2, a0, 2047
-; RV64ZBA-NEXT:    addi a2, a2, 1
-; RV64ZBA-NEXT:    sltu a0, a2, a0
-; RV64ZBA-NEXT:    sd a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.i64.constant_2048:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addi a2, a0, 2047
-; RV64ZICOND-NEXT:    addi a2, a2, 1
-; RV64ZICOND-NEXT:    sltu a0, a2, a0
-; RV64ZICOND-NEXT:    sd a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2048)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) {
-; RV64-LABEL: uaddo.i64.constant_2049:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a2, a0, 2047
-; RV64-NEXT:    addi a2, a2, 2
-; RV64-NEXT:    sltu a0, a2, a0
-; RV64-NEXT:    sd a2, 0(a1)
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.i64.constant_2049:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addi a2, a0, 2047
-; RV64ZBA-NEXT:    addi a2, a2, 2
-; RV64ZBA-NEXT:    sltu a0, a2, a0
-; RV64ZBA-NEXT:    sd a2, 0(a1)
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.i64.constant_2049:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addi a2, a0, 2047
-; RV64ZICOND-NEXT:    addi a2, a2, 2
-; RV64ZICOND-NEXT:    sltu a0, a2, a0
-; RV64ZICOND-NEXT:    sd a2, 0(a1)
-; RV64ZICOND-NEXT:    ret
-entry:
-  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2049)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, ptr %res
-  ret i1 %obit
-}
-
-define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) {
-; RV64-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    ld a1, 0(a0)
-; RV64-NEXT:    addi a0, a1, 2
-; RV64-NEXT:    bltu a0, a1, .LBB69_2
-; RV64-NEXT:  # %bb.1: # %IfOverflow
-; RV64-NEXT:    li a0, 0
-; RV64-NEXT:  .LBB69_2: # %IfNoOverflow
-; RV64-NEXT:    ret
-;
-; RV64ZBA-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
-; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    ld a1, 0(a0)
-; RV64ZBA-NEXT:    addi a0, a1, 2
-; RV64ZBA-NEXT:    bltu a0, a1, .LBB69_2
-; RV64ZBA-NEXT:  # %bb.1: # %IfOverflow
-; RV64ZBA-NEXT:    li a0, 0
-; RV64ZBA-NEXT:  .LBB69_2: # %IfNoOverflow
-; RV64ZBA-NEXT:    ret
-;
-; RV64ZICOND-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
-; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    ld a1, 0(a0)
-; RV64ZICOND-NEXT:    addi a0, a1, 2
-; RV64ZICOND-NEXT:    bltu a0, a1, .LBB69_2
-; RV64ZICOND-NEXT:  # %bb.1: # %IfOverflow
-; RV64ZICOND-NEXT:    li a0, 0
-; RV64ZICOND-NEXT:  .LBB69_2: # %IfNoOverflow
-; RV64ZICOND-NEXT:    ret
-entry:
-  %v1 = load i64, ptr %p
-  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %IfNoOverflow, label %IfOverflow
-IfOverflow:
-  ret i64 0
-IfNoOverflow:
-  ret i64 %val
-}
-
-declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 27a5773e64043a..1c14e2ca5ef87a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -29,10 +29,9 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
 ; CHECK-NEXT:    vrgather.vv v13, v10, v12
 ; CHECK-NEXT:    vadd.vi v10, v11, -15
-; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vmsne.vi v8, v13, 0
-; CHECK-NEXT:    vmv.v.v v0, v9
 ; CHECK-NEXT:    ret
   %vec = load <32 x i1>, ptr %p
   %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 5e68d8cbb07556..016f95bfef7e71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -1379,9 +1379,9 @@ define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl)
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, a1
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmsne.vi v9, v9, 0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1400,9 +1400,9 @@ define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl)
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV64-NEXT:    vmv.s.x v9, a1
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmsne.vi v9, v9, 0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1427,10 +1427,10 @@ define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vid.v v9
 ; RV32-NEXT:    vmsltu.vx v9, v9, a1
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1452,10 +1452,10 @@ define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vid.v v9
 ; RV64-NEXT:    vmsltu.vx v9, v9, a1
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1483,10 +1483,10 @@ define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vid.v v9
 ; RV32-NEXT:    vmsltu.vx v9, v9, a1
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1510,10 +1510,10 @@ define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vid.v v9
 ; RV64-NEXT:    vmsltu.vx v9, v9, a1
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1543,10 +1543,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vid.v v10
 ; RV32-NEXT:    vmsltu.vx v9, v10, a1
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1572,10 +1572,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV64-NEXT:    vid.v v10
 ; RV64-NEXT:    vmsltu.vx v9, v10, a1
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1607,10 +1607,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m,
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vid.v v12
 ; RV32-NEXT:    vmsltu.vx v9, v12, a1
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 1
@@ -1638,10 +1638,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m,
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vid.v v12
 ; RV64-NEXT:    vmsltu.vx v9, v12, a1
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 1
@@ -1754,10 +1754,10 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV32-NEXT:    addi a2, a2, %lo(.LCPI72_0)
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV32-NEXT:    vle8.v v12, (a2)
+; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vid.v v16
 ; RV32-NEXT:    vmsltu.vx v14, v16, a1
 ; RV32-NEXT:    vsext.vf4 v16, v12
-; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    vmsltu.vx v12, v16, a1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vslideup.vi v14, v12, 4
@@ -1798,10 +1798,10 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV64-NEXT:    addi a2, a2, %lo(.LCPI72_0)
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vle8.v v12, (a2)
+; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vid.v v16
 ; RV64-NEXT:    vmsltu.vx v14, v16, a1
 ; RV64-NEXT:    vsext.vf4 v16, v12
-; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    vmsltu.vx v12, v16, a1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vslideup.vi v14, v12, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 63005716860135..2de2b750e68080 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -26,10 +26,9 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) {
 ; CHECK-NEXT:    vadd.vi v12, v11, 1
 ; CHECK-NEXT:    vrgather.vv v13, v10, v12
 ; CHECK-NEXT:    vadd.vi v10, v11, -15
-; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10, v0.t
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vmsne.vi v8, v13, 0
-; CHECK-NEXT:    vmv.v.v v0, v9
 ; CHECK-NEXT:    ret
 %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
 ret {<16 x i1>, <16 x i1>} %retval
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 6a4ebb6b30af26..bcb008857ad320 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -97,10 +97,10 @@ define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    vmv1r.v v12, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
-; CHECK-NEXT:    vmv1r.v v12, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0
 ; CHECK-NEXT:    vmv1r.v v0, v12
diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
index bbf1df20aeda58..1efd57194fba61 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
@@ -460,9 +460,8 @@ define ptr @sdia(ptr %base, i64 %a, i64 %b) {
 ; RV32XTHEADMEMIDX-NEXT:    sltu a1, a3, a1
 ; RV32XTHEADMEMIDX-NEXT:    add a1, a2, a1
 ; RV32XTHEADMEMIDX-NEXT:    sw a3, 0(a0)
-; RV32XTHEADMEMIDX-NEXT:    addi a5, a0, 64
 ; RV32XTHEADMEMIDX-NEXT:    sw a1, 4(a0)
-; RV32XTHEADMEMIDX-NEXT:    mv a0, a5
+; RV32XTHEADMEMIDX-NEXT:    addi a0, a0, 64
 ; RV32XTHEADMEMIDX-NEXT:    ret
 ;
 ; RV64XTHEADMEMIDX-LABEL: sdia:

>From efc8bd617b1361f91c7d415060f4360bf27f8e68 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Wed, 21 Aug 2024 22:29:01 +0200
Subject: [PATCH 03/15] Formatting

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 39 +++++++++++----------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 7c5bf9b5c3facd..750eaf2de48570 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -121,9 +121,8 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs {
   }
 };
 
-static std::optional<llvm::SmallVector<MachineInstr *>> moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
-                                               SUnit *Src,
-                                               ScheduleDAGMCP &DG) {
+static std::optional<llvm::SmallVector<MachineInstr *>>
+moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
   MachineInstr *DstInstr = Dst->getInstr();
   MachineInstr *SrcInstr = Src->getInstr();
   MachineBasicBlock *MBB = SrcInstr->getParent();
@@ -185,15 +184,15 @@ static std::optional<llvm::SmallVector<MachineInstr *>> moveInstructionsOutOfThe
   // processing stage. In some context it does matter what the parent of the
   // instruction was: Namely when we are starting the traversal with the source
   // of the copy propagation. This instruction must have the destination as a
-  // dependency. In case of other instruction than has the destination as a dependency, this
-  // dependency would mean the end of the traversal, but in this scenario this
-  // must be ignored. Let's say that we can not control what nodes to process
-  // and we come across the copy source. How do I know what node has that copy
-  // source as their dependency? We can check of which node is the copy source
-  // the dependency of. This list will alway contain the source. To decide if we
-  // have it as dependency of another instruction, we must check in the already
-  // traversed list if any of the instructions that is depended on the source is
-  // contained. This would introduce extra costs.
+  // dependency. In case of other instruction than has the destination as a
+  // dependency, this dependency would mean the end of the traversal, but in
+  // this scenario this must be ignored. Let's say that we can not control what
+  // nodes to process and we come across the copy source. How do I know what
+  // node has that copy source as their dependency? We can check of which node
+  // is the copy source the dependency of. This list will alway contain the
+  // source. To decide if we have it as dependency of another instruction, we
+  // must check in the already traversed list if any of the instructions that is
+  // depended on the source is contained. This would introduce extra costs.
   ProcessSNodeChildren(Edges, Dst, true);
   while (!Edges.empty()) {
     const auto *Current = Edges.front();
@@ -1156,9 +1155,9 @@ static bool isBackwardPropagatableCopy(const DestSourcePair &CopyOperands,
   return CopyOperands.Source->isRenamable() && CopyOperands.Source->isKill();
 }
 
-void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
-                                           ScheduleDAGMCP &DG,
-                                           bool MoveDependenciesForBetterCopyPropagation) {
+void MachineCopyPropagation::propagateDefs(
+    MachineInstr &MI, ScheduleDAGMCP &DG,
+    bool MoveDependenciesForBetterCopyPropagation) {
   if (!Tracker.hasAnyCopies() && !Tracker.hasAnyInvalidCopies())
     return;
 
@@ -1204,7 +1203,8 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
       SUnit *DstSUnit = DG.getSUnit(Copy);
       SUnit *SrcSUnit = DG.getSUnit(&MI);
 
-      InstructionsToMove = moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit, DG);
+      InstructionsToMove =
+          moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit);
       if (!InstructionsToMove)
         continue;
     }
@@ -1273,8 +1273,8 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
         // Unlike forward cp, we don't invoke propagateDefs here,
         // just let forward cp do COPY-to-COPY propagation.
         if (isBackwardPropagatableCopy(*CopyOperands, *MRI)) {
-          Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII,
-                                     UseCopyInstr, MoveDependenciesForBetterCopyPropagation);
+          Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII, UseCopyInstr,
+                                     MoveDependenciesForBetterCopyPropagation);
           Tracker.invalidateRegister(DefReg.asMCReg(), *TRI, *TII,
                                      UseCopyInstr);
           Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
@@ -1316,7 +1316,8 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
           }
         } else {
           Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII,
-                                     UseCopyInstr, MoveDependenciesForBetterCopyPropagation);
+                                     UseCopyInstr,
+                                     MoveDependenciesForBetterCopyPropagation);
         }
       }
     }

>From f65245965fd3a2ae083ad3b7b38083ec903f5e02 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Wed, 21 Aug 2024 23:24:21 +0200
Subject: [PATCH 04/15] Update tests

---
 .../AArch64/GlobalISel/arm64-atomic.ll        |  13 +-
 .../CodeGen/AArch64/arm64-non-pow2-ldst.ll    |   5 +-
 llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll   |  12 +-
 llvm/test/CodeGen/AArch64/fcmp.ll             |  18 +-
 llvm/test/CodeGen/AArch64/sext.ll             |  41 +-
 .../AArch64/sve-vector-deinterleave.ll        |  10 +-
 .../CodeGen/AArch64/sve-vector-interleave.ll  |   6 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    |  49 +-
 llvm/test/CodeGen/AArch64/zext-to-tbl.ll      |  18 +-
 llvm/test/CodeGen/AArch64/zext.ll             |  41 +-
 .../CodeGen/ARM/Windows/wineh-framepointer.ll |   3 +
 llvm/test/CodeGen/ARM/aes-erratum-fix.ll      |  14 +-
 llvm/test/CodeGen/ARM/fpclamptosat_vec.ll     |  53 +-
 .../test/CodeGen/ARM/mcp-dest-regs-no-dup.mir |   3 +
 .../CodeGen/ARM/srem-seteq-illegal-types.ll   |  20 +-
 .../ARM/vecreduce-fadd-legalization-strict.ll |  14 +-
 llvm/test/CodeGen/Thumb/smul_fix_sat.ll       |  12 +-
 .../Thumb/umulo-128-legalisation-lowering.ll  |   4 +-
 .../CodeGen/Thumb2/mve-fpclamptosat_vec.ll    |  89 +--
 .../CodeGen/Thumb2/mve-fptosi-sat-vector.ll   |  51 +-
 .../CodeGen/Thumb2/mve-fptoui-sat-vector.ll   |  47 +-
 .../CodeGen/Thumb2/mve-laneinterleaving.ll    |   6 +-
 llvm/test/CodeGen/Thumb2/mve-minmax.ll        |   7 +-
 llvm/test/CodeGen/Thumb2/mve-pred-ext.ll      |  26 +-
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll       |  24 +-
 llvm/test/CodeGen/Thumb2/mve-shufflemov.ll    |  50 +-
 llvm/test/CodeGen/Thumb2/mve-vabdus.ll        |   6 +-
 llvm/test/CodeGen/Thumb2/mve-vcvt16.ll        |   4 +-
 llvm/test/CodeGen/Thumb2/mve-vld4.ll          |   4 +-
 llvm/test/CodeGen/Thumb2/mve-vmovn.ll         |   4 +-
 llvm/test/CodeGen/Thumb2/mve-vst4.ll          |  14 +-
 .../CodeGen/Thumb2/mve-zext-masked-load.ll    |   2 +-
 llvm/test/CodeGen/X86/apx/mul-i1024.ll        |  31 +-
 llvm/test/CodeGen/X86/atomic-unordered.ll     |   2 +-
 llvm/test/CodeGen/X86/avx512-calling-conv.ll  |  38 +-
 .../CodeGen/X86/avx512-gfni-intrinsics.ll     |  72 +-
 .../test/CodeGen/X86/avx512-insert-extract.ll |  14 +-
 llvm/test/CodeGen/X86/avx512-mask-op.ll       |  10 +-
 .../X86/avx512bw-intrinsics-upgrade.ll        |   4 +-
 .../X86/avx512vl-intrinsics-upgrade.ll        |  12 +-
 .../X86/div-rem-pair-recomposition-signed.ll  |   3 +-
 .../div-rem-pair-recomposition-unsigned.ll    |   8 +-
 llvm/test/CodeGen/X86/extract-bits.ll         |  18 +-
 llvm/test/CodeGen/X86/legalize-shl-vec.ll     |  12 +-
 llvm/test/CodeGen/X86/matrix-multiply.ll      |  71 +-
 llvm/test/CodeGen/X86/mul-i256.ll             |   6 +-
 llvm/test/CodeGen/X86/mul-i512.ll             |   2 +-
 .../X86/peephole-na-phys-copy-folding.ll      |   5 +-
 llvm/test/CodeGen/X86/pmulh.ll                |  18 +-
 llvm/test/CodeGen/X86/pr34177.ll              |   2 +-
 llvm/test/CodeGen/X86/pr61964.ll              |   6 +-
 llvm/test/CodeGen/X86/shift-i128.ll           |  19 +-
 llvm/test/CodeGen/X86/smul_fix.ll             |   6 +-
 llvm/test/CodeGen/X86/smul_fix_sat.ll         |   3 +-
 .../X86/smulo-128-legalisation-lowering.ll    |  18 +-
 .../subvectorwise-store-of-vector-splat.ll    |  38 +-
 llvm/test/CodeGen/X86/umul_fix.ll             |   6 +-
 llvm/test/CodeGen/X86/umul_fix_sat.ll         |   6 +-
 llvm/test/CodeGen/X86/vec_umulo.ll            |   9 +-
 .../vector-interleaved-load-i16-stride-2.ll   |   4 +-
 .../vector-interleaved-load-i16-stride-3.ll   |  35 +-
 .../vector-interleaved-load-i16-stride-4.ll   |   4 +-
 .../vector-interleaved-load-i16-stride-5.ll   |  48 +-
 .../vector-interleaved-load-i16-stride-6.ll   |  36 +-
 .../vector-interleaved-load-i16-stride-7.ll   |  59 +-
 .../vector-interleaved-load-i16-stride-8.ll   |  84 +-
 .../vector-interleaved-load-i32-stride-3.ll   |  76 +-
 .../vector-interleaved-load-i32-stride-4.ll   |  64 +-
 .../vector-interleaved-load-i32-stride-5.ll   | 248 +++---
 .../vector-interleaved-load-i32-stride-6.ll   |  93 +--
 .../vector-interleaved-load-i32-stride-7.ll   | 122 +--
 .../vector-interleaved-load-i32-stride-8.ll   | 406 +++++-----
 .../vector-interleaved-load-i64-stride-4.ll   |  33 +-
 .../vector-interleaved-load-i64-stride-5.ll   |  42 +-
 .../vector-interleaved-load-i64-stride-6.ll   |  40 +-
 .../vector-interleaved-load-i64-stride-7.ll   | 468 ++++++-----
 .../vector-interleaved-load-i64-stride-8.ll   | 750 +++++++++---------
 .../vector-interleaved-load-i8-stride-3.ll    |  10 +-
 .../vector-interleaved-load-i8-stride-4.ll    |  14 +-
 .../vector-interleaved-load-i8-stride-5.ll    |  10 +-
 .../vector-interleaved-load-i8-stride-6.ll    |  16 +-
 .../vector-interleaved-load-i8-stride-7.ll    | 122 ++-
 .../vector-interleaved-load-i8-stride-8.ll    | 102 +--
 .../vector-interleaved-store-i16-stride-3.ll  | 150 ++--
 .../vector-interleaved-store-i16-stride-4.ll  |  64 +-
 .../vector-interleaved-store-i16-stride-5.ll  |  57 +-
 .../vector-interleaved-store-i16-stride-6.ll  |  87 +-
 .../vector-interleaved-store-i16-stride-7.ll  |  75 +-
 .../vector-interleaved-store-i16-stride-8.ll  |  60 +-
 .../vector-interleaved-store-i32-stride-2.ll  |  57 +-
 .../vector-interleaved-store-i32-stride-3.ll  |  11 +-
 .../vector-interleaved-store-i32-stride-5.ll  |  35 +-
 .../vector-interleaved-store-i32-stride-6.ll  | 192 ++---
 .../vector-interleaved-store-i32-stride-7.ll  | 189 +++--
 .../vector-interleaved-store-i32-stride-8.ll  |   8 +-
 .../vector-interleaved-store-i64-stride-3.ll  |  20 +-
 .../vector-interleaved-store-i64-stride-4.ll  | 176 ++--
 .../vector-interleaved-store-i64-stride-5.ll  |  20 +-
 .../vector-interleaved-store-i64-stride-7.ll  | 544 ++++++-------
 .../vector-interleaved-store-i64-stride-8.ll  | 304 +++----
 .../vector-interleaved-store-i8-stride-3.ll   |   5 +-
 .../vector-interleaved-store-i8-stride-5.ll   |  15 +-
 .../vector-interleaved-store-i8-stride-6.ll   |  71 +-
 .../vector-interleaved-store-i8-stride-7.ll   |  35 +-
 .../vector-interleaved-store-i8-stride-8.ll   |  15 +-
 llvm/test/CodeGen/X86/vector-intrinsics.ll    |   4 +-
 .../X86/vector-shuffle-combining-avx.ll       |  23 +-
 llvm/test/CodeGen/X86/vector-zext.ll          |   9 +-
 .../X86/wide-scalar-shift-legalization.ll     |   2 +-
 .../CodeGen/X86/x86-interleaved-access.ll     |  21 +-
 llvm/test/CodeGen/X86/xmulo.ll                |  12 +-
 111 files changed, 3035 insertions(+), 3221 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index 92575d701f4281..da75b5bbeb33bc 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -106,11 +106,10 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) #0 {
 ; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_from_load:
 ; CHECK-OUTLINE-O1:       ; %bb.0:
 ; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; CHECK-OUTLINE-O1-NEXT:    ldr w8, [x2]
 ; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
 ; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    ldr w1, [x2]
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
-; CHECK-OUTLINE-O1-NEXT:    mov w1, w8
 ; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas4_acq
 ; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-OUTLINE-O1-NEXT:    ret
@@ -6026,8 +6025,8 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
 ; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
 ; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
 ; CHECK-OUTLINE-O1-NEXT:    mov w0, w19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
@@ -6133,8 +6132,8 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
 ; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
 ; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
 ; CHECK-OUTLINE-O1-NEXT:    mov w0, w19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
@@ -6238,8 +6237,8 @@ define { i32, i1 } @cmpxchg_i32(ptr %ptr, i32 %desired, i32 %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
 ; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
 ; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
 ; CHECK-OUTLINE-O1-NEXT:    mov w0, w19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
@@ -6336,8 +6335,8 @@ define { i64, i1 } @cmpxchg_i64(ptr %ptr, i64 %desired, i64 %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov x19, x1
 ; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x19, x1
 ; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
 ; CHECK-OUTLINE-O1-NEXT:    mov x0, x19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
@@ -6434,8 +6433,8 @@ define { ptr, i1 } @cmpxchg_ptr(ptr %ptr, ptr %desired, ptr %new) {
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
 ; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
-; CHECK-OUTLINE-O1-NEXT:    mov x19, x1
 ; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x19, x1
 ; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
 ; CHECK-OUTLINE-O1-NEXT:    mov x0, x19
 ; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
diff --git a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
index cd821675bae6e1..91d8cf98753c92 100644
--- a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -54,11 +54,10 @@ define i280 @ldi280(ptr %p) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w9, [x0, #34]
 ; CHECK-NEXT:    ldrh w10, [x0, #32]
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldp x1, x2, [x0, #8]
 ; CHECK-NEXT:    ldr x3, [x0, #24]
+; CHECK-NEXT:    ldp x1, x2, [x0, #8]
+; CHECK-NEXT:    ldr x0, [x0]
 ; CHECK-NEXT:    orr x4, x10, x9, lsl #16
-; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    ret
     %r = load i280, ptr %p
     ret i280 %r
diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
index 4558d7c464fe32..186d191444feb6 100644
--- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
+++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
@@ -31,10 +31,10 @@ define i32 @test_return(ptr %p, i32 %oldval, i32 %newval) {
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
-; OUTLINE-ATOMICS-NEXT:    mov w19, w1
 ; OUTLINE-ATOMICS-NEXT:    mov x8, x0
+; OUTLINE-ATOMICS-NEXT:    mov w19, w1
+; OUTLINE-ATOMICS-NEXT:    mov w0, w1
 ; OUTLINE-ATOMICS-NEXT:    mov w1, w2
-; OUTLINE-ATOMICS-NEXT:    mov w0, w19
 ; OUTLINE-ATOMICS-NEXT:    mov x2, x8
 ; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas4_acq_rel
 ; OUTLINE-ATOMICS-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
@@ -81,10 +81,10 @@ define i1 @test_return_bool(ptr %value, i8 %oldValue, i8 %newValue) {
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
-; OUTLINE-ATOMICS-NEXT:    mov w19, w1
 ; OUTLINE-ATOMICS-NEXT:    mov x8, x0
+; OUTLINE-ATOMICS-NEXT:    mov w19, w1
+; OUTLINE-ATOMICS-NEXT:    mov w0, w1
 ; OUTLINE-ATOMICS-NEXT:    mov w1, w2
-; OUTLINE-ATOMICS-NEXT:    mov w0, w19
 ; OUTLINE-ATOMICS-NEXT:    mov x2, x8
 ; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas1_acq_rel
 ; OUTLINE-ATOMICS-NEXT:    cmp w0, w19, uxtb
@@ -126,10 +126,10 @@ define void @test_conditional(ptr %p, i32 %oldval, i32 %newval) {
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
 ; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
-; OUTLINE-ATOMICS-NEXT:    mov w19, w1
 ; OUTLINE-ATOMICS-NEXT:    mov x8, x0
+; OUTLINE-ATOMICS-NEXT:    mov w19, w1
+; OUTLINE-ATOMICS-NEXT:    mov w0, w1
 ; OUTLINE-ATOMICS-NEXT:    mov w1, w2
-; OUTLINE-ATOMICS-NEXT:    mov w0, w19
 ; OUTLINE-ATOMICS-NEXT:    mov x2, x8
 ; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas4_acq_rel
 ; OUTLINE-ATOMICS-NEXT:    cmp w0, w19
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index cd227a02357660..a5d7ae147ffda2 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1618,23 +1618,23 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ; CHECK-GI-NOFP16-NEXT:    mov v16.s[1], w1
 ; CHECK-GI-NOFP16-NEXT:    mov v18.s[1], w5
 ; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w8
-; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-GI-NOFP16-NEXT:    fmov w9, s5
+; CHECK-GI-NOFP16-NEXT:    fmov s5, w7
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v6.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w7
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ldr s7, [sp, #24]
 ; CHECK-GI-NOFP16-NEXT:    mov v16.s[2], w2
-; CHECK-GI-NOFP16-NEXT:    mov v18.s[2], w6
-; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w8
 ; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], w9
 ; CHECK-GI-NOFP16-NEXT:    fmov w9, s6
 ; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], v17.s[0]
 ; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #40]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v18.s[2], w6
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v16.s[3], w3
 ; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], w9
@@ -1687,7 +1687,6 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
 ; CHECK-GI-FP16-NEXT:    fmov s1, w8
 ; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
-; CHECK-GI-FP16-NEXT:    fmov s5, w9
 ; CHECK-GI-FP16-NEXT:    mov v2.s[2], w10
 ; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    mov v6.s[2], w2
@@ -1695,19 +1694,20 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ; CHECK-GI-FP16-NEXT:    mov v7.s[2], v16.s[0]
 ; CHECK-GI-FP16-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-FP16-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    mov v5.s[1], w9
+; CHECK-GI-FP16-NEXT:    fmov s5, w9
 ; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v6.s[3], w3
 ; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
 ; CHECK-GI-FP16-NEXT:    fmov w8, s3
 ; CHECK-GI-FP16-NEXT:    fmov s3, w7
-; CHECK-GI-FP16-NEXT:    mov v5.s[2], w9
+; CHECK-GI-FP16-NEXT:    mov v5.s[1], w9
 ; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v3.s[1], w8
 ; CHECK-GI-FP16-NEXT:    fmov w8, s4
 ; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #16]
 ; CHECK-GI-FP16-NEXT:    ushl v1.4s, v1.4s, v2.4s
 ; CHECK-GI-FP16-NEXT:    neg v2.4s, v2.4s
+; CHECK-GI-FP16-NEXT:    mov v5.s[2], w9
 ; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
 ; CHECK-GI-FP16-NEXT:    sshl v1.4s, v1.4s, v2.4s
 ; CHECK-GI-FP16-NEXT:    fmov w8, s4
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 7d7b862098879c..2dcd71b39bd0ee 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -337,9 +337,9 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-GI-LABEL: sext_v3i16_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    smov x8, v0.h[0]
 ; CHECK-GI-NEXT:    smov x9, v0.h[1]
 ; CHECK-GI-NEXT:    smov x10, v0.h[2]
-; CHECK-GI-NEXT:    smov x8, v0.h[0]
 ; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
@@ -362,9 +362,9 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i32_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    smov x8, v0.s[0]
 ; CHECK-GI-NEXT:    smov x9, v0.s[1]
 ; CHECK-GI-NEXT:    smov x10, v0.s[2]
-; CHECK-GI-NEXT:    smov x8, v0.s[0]
 ; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
@@ -702,20 +702,18 @@ entry:
 define <8 x i64> @sext_v8i32_v8i64(<8 x i32> %a) {
 ; CHECK-SD-LABEL: sext_v8i32_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll2 v4.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    sshll2 v3.2d, v1.4s, #0
-; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-SD-NEXT:    mov v1.16b, v4.16b
+; CHECK-SD-NEXT:    sshll2 v1.2d, v0.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v8i32_v8i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll2 v5.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    sshll2 v3.2d, v1.4s, #0
-; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    sshll2 v1.2d, v0.4s, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <8 x i32> %a to <8 x i64>
@@ -879,20 +877,18 @@ entry:
 define <16 x i32> @sext_v16i16_v16i32(<16 x i16> %a) {
 ; CHECK-SD-LABEL: sext_v16i16_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll2 v4.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-SD-NEXT:    mov v1.16b, v4.16b
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v16i16_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
 ; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i16> %a to <16 x i32>
@@ -939,29 +935,26 @@ entry:
 define <16 x i64> @sext_v16i32_v16i64(<16 x i32> %a) {
 ; CHECK-SD-LABEL: sext_v16i32_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll v18.2d, v1.2s, #0
-; CHECK-SD-NEXT:    sshll2 v16.2d, v1.4s, #0
-; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v7.2d, v3.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    sshll v4.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v5.2d, v2.4s, #0
-; CHECK-SD-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v1.2d, v0.4s, #0
-; CHECK-SD-NEXT:    mov v2.16b, v18.16b
-; CHECK-SD-NEXT:    mov v3.16b, v16.16b
+; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v16i32_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll2 v19.2d, v1.4s, #0
-; CHECK-GI-NEXT:    sshll2 v17.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    sshll v18.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v19.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v2.4s, #0
 ; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    sshll2 v7.2d, v3.4s, #0
-; CHECK-GI-NEXT:    sshll2 v5.2d, v2.4s, #0
-; CHECK-GI-NEXT:    mov v1.16b, v17.16b
 ; CHECK-GI-NEXT:    mov v2.16b, v18.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v19.16b
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index eafb71a0b23a3e..63f2df84d81731 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -170,11 +170,10 @@ define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1
 define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv8i64(<vscale x 8 x i64> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv4i64_nxv8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
 ; CHECK-NEXT:    uzp2 z6.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
 ; CHECK-NEXT:    uzp2 z3.d, z2.d, z3.d
-; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    mov z2.d, z6.d
 ; CHECK-NEXT:    ret
 %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
@@ -184,15 +183,14 @@ ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
 define {<vscale x 8 x i64>, <vscale x 8 x i64>}  @vector_deinterleave_nxv8i64_nxv16i64(<vscale x 16 x i64> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv8i64_nxv16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z24.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z2.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp2 z28.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp2 z29.d, z2.d, z3.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT:    uzp1 z2.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp1 z3.d, z6.d, z7.d
 ; CHECK-NEXT:    uzp2 z7.d, z6.d, z7.d
 ; CHECK-NEXT:    uzp2 z6.d, z4.d, z5.d
-; CHECK-NEXT:    mov z1.d, z24.d
 ; CHECK-NEXT:    mov z4.d, z28.d
 ; CHECK-NEXT:    mov z5.d, z29.d
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index fe089fa4a6417e..39dda7fdfb57d6 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -165,10 +165,9 @@ define <vscale x 16 x i32> @interleave2_nxv16i32(<vscale x 8 x i32> %vec0, <vsca
 ; CHECK-LABEL: interleave2_nxv16i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    zip1 z4.s, z1.s, z3.s
-; CHECK-NEXT:    zip1 z5.s, z0.s, z2.s
 ; CHECK-NEXT:    zip2 z3.s, z1.s, z3.s
 ; CHECK-NEXT:    zip2 z1.s, z0.s, z2.s
-; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    zip1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    mov z2.d, z4.d
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 16 x i32>@llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
@@ -179,10 +178,9 @@ define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale
 ; CHECK-LABEL: interleave2_nxv8i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
-; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
 ; CHECK-NEXT:    zip2 z3.d, z1.d, z3.d
 ; CHECK-NEXT:    zip2 z1.d, z0.d, z2.d
-; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z4.d
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 150a67ab7974dd..54ada05c904487 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4819,47 +4819,45 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-GI-NEXT:    usubl v1.8h, v1.8b, v2.8b
 ; CHECK-GI-NEXT:    ldr d3, [x10]
 ; CHECK-GI-NEXT:    ldr d4, [x11]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x11, x11, x8
 ; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ldr d2, [x10]
-; CHECK-GI-NEXT:    ldr d6, [x11]
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    ldr d2, [x10]
+; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ldr d6, [x11]
+; CHECK-GI-NEXT:    add x11, x11, x8
 ; CHECK-GI-NEXT:    usubl v3.8h, v3.8b, v4.8b
-; CHECK-GI-NEXT:    ldr d4, [x10]
-; CHECK-GI-NEXT:    ldr d16, [x11]
 ; CHECK-GI-NEXT:    abs v5.4s, v5.4s
 ; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    ldr d4, [x10]
+; CHECK-GI-NEXT:    ldr d16, [x11]
 ; CHECK-GI-NEXT:    abs v7.4s, v7.4s
+; CHECK-GI-NEXT:    abs v1.4s, v1.4s
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    abs v1.4s, v1.4s
-; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b
 ; CHECK-GI-NEXT:    usubl v2.8h, v2.8b, v6.8b
 ; CHECK-GI-NEXT:    ldr d6, [x10]
 ; CHECK-GI-NEXT:    ldr d17, [x11]
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b
+; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
-; CHECK-GI-NEXT:    ldr d5, [x10]
 ; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
+; CHECK-GI-NEXT:    ldr d5, [x10]
 ; CHECK-GI-NEXT:    ldr d7, [x11]
-; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0
-; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
-; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    sshll v18.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
 ; CHECK-GI-NEXT:    usubl v6.8h, v6.8b, v17.8b
+; CHECK-GI-NEXT:    ldr d17, [x11, x8]
+; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0
 ; CHECK-GI-NEXT:    usubl v5.8h, v5.8b, v7.8b
 ; CHECK-GI-NEXT:    ldr d7, [x10, x9]
-; CHECK-GI-NEXT:    ldr d17, [x11, x8]
-; CHECK-GI-NEXT:    abs v19.4s, v19.4s
-; CHECK-GI-NEXT:    abs v4.4s, v4.4s
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
 ; CHECK-GI-NEXT:    abs v16.4s, v16.4s
 ; CHECK-GI-NEXT:    abs v3.4s, v3.4s
 ; CHECK-GI-NEXT:    abs v18.4s, v18.4s
@@ -4867,33 +4865,36 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-GI-NEXT:    usubl v7.8h, v7.8b, v17.8b
 ; CHECK-GI-NEXT:    sshll v17.4s, v6.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v6.4s, v6.8h, #0
-; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    abs v19.4s, v19.4s
+; CHECK-GI-NEXT:    abs v4.4s, v4.4s
 ; CHECK-GI-NEXT:    add v3.4s, v16.4s, v3.4s
 ; CHECK-GI-NEXT:    sshll v16.4s, v5.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0
 ; CHECK-GI-NEXT:    add v2.4s, v18.4s, v2.4s
 ; CHECK-GI-NEXT:    abs v17.4s, v17.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
 ; CHECK-GI-NEXT:    abs v6.4s, v6.4s
-; CHECK-GI-NEXT:    addv s4, v4.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s
+; CHECK-GI-NEXT:    addv s3, v3.4s
 ; CHECK-GI-NEXT:    sshll v18.4s, v7.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v7.4s, v7.8h, #0
 ; CHECK-GI-NEXT:    abs v16.4s, v16.4s
 ; CHECK-GI-NEXT:    abs v5.4s, v5.4s
 ; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    add v6.4s, v17.4s, v6.4s
-; CHECK-GI-NEXT:    fmov w9, s0
 ; CHECK-GI-NEXT:    addv s2, v2.4s
-; CHECK-GI-NEXT:    addv s3, v3.4s
-; CHECK-GI-NEXT:    fmov w10, s4
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    addv s4, v4.4s
+; CHECK-GI-NEXT:    fmov w10, s3
 ; CHECK-GI-NEXT:    abs v18.4s, v18.4s
 ; CHECK-GI-NEXT:    abs v7.4s, v7.4s
 ; CHECK-GI-NEXT:    add v1.4s, v16.4s, v5.4s
 ; CHECK-GI-NEXT:    add w8, w8, w9
-; CHECK-GI-NEXT:    fmov w9, s2
 ; CHECK-GI-NEXT:    addv s3, v6.4s
+; CHECK-GI-NEXT:    fmov w9, s2
 ; CHECK-GI-NEXT:    add w8, w10, w8
+; CHECK-GI-NEXT:    fmov w10, s4
 ; CHECK-GI-NEXT:    add v0.4s, v18.4s, v7.4s
 ; CHECK-GI-NEXT:    addv s1, v1.4s
 ; CHECK-GI-NEXT:    add w8, w9, w8
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 38ad96df79cf6d..66bb131ce72494 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2848,21 +2848,21 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1]
 ; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
+; CHECK-BE-NEXT:    add x8, x0, #16
+; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
 ; CHECK-BE-NEXT:    add x9, x0, #48
 ; CHECK-BE-NEXT:    add x10, x0, #32
 ; CHECK-BE-NEXT:    subs w2, w2, #1
 ; CHECK-BE-NEXT:    add x1, x1, #16
 ; CHECK-BE-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-BE-NEXT:    umull2 v5.4s, v3.8h, v0.8h
 ; CHECK-BE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
-; CHECK-BE-NEXT:    add x8, x0, #16
 ; CHECK-BE-NEXT:    umull v4.4s, v1.4h, v2.4h
-; CHECK-BE-NEXT:    umull2 v1.4s, v1.8h, v2.8h
+; CHECK-BE-NEXT:    umull2 v5.4s, v3.8h, v0.8h
 ; CHECK-BE-NEXT:    umull v0.4s, v3.4h, v0.4h
-; CHECK-BE-NEXT:    st1 { v5.4s }, [x9]
+; CHECK-BE-NEXT:    umull2 v1.4s, v1.8h, v2.8h
 ; CHECK-BE-NEXT:    st1 { v4.4s }, [x0]
 ; CHECK-BE-NEXT:    mov x0, x8
+; CHECK-BE-NEXT:    st1 { v5.4s }, [x9]
 ; CHECK-BE-NEXT:    st1 { v0.4s }, [x10]
 ; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
 ; CHECK-BE-NEXT:    b.ne .LBB24_1
@@ -2980,16 +2980,16 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    add x10, x0, #16
 ; CHECK-BE-NEXT:    subs w2, w2, #1
 ; CHECK-BE-NEXT:    ext v17.16b, v5.16b, v5.16b, #8
-; CHECK-BE-NEXT:    rev32 v5.8b, v5.8b
 ; CHECK-BE-NEXT:    ext v19.16b, v6.16b, v6.16b, #8
+; CHECK-BE-NEXT:    rev32 v5.8b, v5.8b
 ; CHECK-BE-NEXT:    rev32 v21.8b, v7.8b
 ; CHECK-BE-NEXT:    rev32 v23.8b, v4.8b
 ; CHECK-BE-NEXT:    ext v7.16b, v7.16b, v7.16b, #8
 ; CHECK-BE-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
 ; CHECK-BE-NEXT:    rev32 v6.8b, v6.8b
 ; CHECK-BE-NEXT:    rev32 v17.8b, v17.8b
-; CHECK-BE-NEXT:    umull v5.2d, v5.2s, v18.2s
 ; CHECK-BE-NEXT:    rev32 v19.8b, v19.8b
+; CHECK-BE-NEXT:    umull v5.2d, v5.2s, v18.2s
 ; CHECK-BE-NEXT:    umull v18.2d, v21.2s, v22.2s
 ; CHECK-BE-NEXT:    ext v21.16b, v22.16b, v22.16b, #8
 ; CHECK-BE-NEXT:    rev32 v7.8b, v7.8b
@@ -2997,9 +2997,9 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    ext v16.16b, v16.16b, v16.16b, #8
 ; CHECK-BE-NEXT:    rev32 v4.8b, v4.8b
 ; CHECK-BE-NEXT:    umull v17.2d, v17.2s, v24.2s
+; CHECK-BE-NEXT:    umull v19.2d, v19.2s, v25.2s
 ; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
 ; CHECK-BE-NEXT:    umull v5.2d, v6.2s, v20.2s
-; CHECK-BE-NEXT:    umull v19.2d, v19.2s, v25.2s
 ; CHECK-BE-NEXT:    umull v6.2d, v7.2s, v21.2s
 ; CHECK-BE-NEXT:    add x8, x0, #112
 ; CHECK-BE-NEXT:    umull v4.2d, v4.2s, v16.2s
@@ -3007,11 +3007,11 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    add x9, x0, #80
 ; CHECK-BE-NEXT:    st1 { v22.2d }, [x0]
 ; CHECK-BE-NEXT:    st1 { v17.2d }, [x8]
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
 ; CHECK-BE-NEXT:    add x8, x0, #64
 ; CHECK-BE-NEXT:    st1 { v19.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x0, #48
 ; CHECK-BE-NEXT:    mov x0, x8
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
 ; CHECK-BE-NEXT:    st1 { v6.2d }, [x9]
 ; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
 ; CHECK-BE-NEXT:    b.ne .LBB25_1
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 5cd7fd15735f3d..aa4fe9dc4ff9a1 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -354,9 +354,9 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-GI-LABEL: zext_v3i16_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    umov w8, v0.h[0]
 ; CHECK-GI-NEXT:    umov w9, v0.h[1]
 ; CHECK-GI-NEXT:    umov w10, v0.h[2]
-; CHECK-GI-NEXT:    umov w8, v0.h[0]
 ; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
@@ -379,9 +379,9 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i32_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, v0.s[0]
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    mov w8, v0.s[0]
 ; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
@@ -727,20 +727,18 @@ entry:
 define <8 x i64> @zext_v8i32_v8i64(<8 x i32> %a) {
 ; CHECK-SD-LABEL: zext_v8i32_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll2 v4.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-SD-NEXT:    mov v1.16b, v4.16b
+; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v8i32_v8i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll2 v5.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <8 x i32> %a to <8 x i64>
@@ -896,20 +894,18 @@ entry:
 define <16 x i32> @zext_v16i16_v16i32(<16 x i16> %a) {
 ; CHECK-SD-LABEL: zext_v16i16_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll2 v4.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-SD-NEXT:    mov v1.16b, v4.16b
+; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v16i16_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
 ; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i16> %a to <16 x i32>
@@ -956,29 +952,26 @@ entry:
 define <16 x i64> @zext_v16i32_v16i64(<16 x i32> %a) {
 ; CHECK-SD-LABEL: zext_v16i32_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll v18.2d, v1.2s, #0
-; CHECK-SD-NEXT:    ushll2 v16.2d, v1.4s, #0
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v7.2d, v3.4s, #0
+; CHECK-SD-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    ushll v4.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v5.2d, v2.4s, #0
-; CHECK-SD-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-SD-NEXT:    mov v2.16b, v18.16b
-; CHECK-SD-NEXT:    mov v3.16b, v16.16b
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v16i32_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll2 v19.2d, v1.4s, #0
-; CHECK-GI-NEXT:    ushll2 v17.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    ushll v18.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v19.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll2 v5.2d, v2.4s, #0
 ; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v7.2d, v3.4s, #0
-; CHECK-GI-NEXT:    ushll2 v5.2d, v2.4s, #0
-; CHECK-GI-NEXT:    mov v1.16b, v17.16b
 ; CHECK-GI-NEXT:    mov v2.16b, v18.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v19.16b
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/ARM/Windows/wineh-framepointer.ll b/llvm/test/CodeGen/ARM/Windows/wineh-framepointer.ll
index 17197006d82613..96557848ed401f 100644
--- a/llvm/test/CodeGen/ARM/Windows/wineh-framepointer.ll
+++ b/llvm/test/CodeGen/ARM/Windows/wineh-framepointer.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ;; Check that this produces the expected assembly output
 ; RUN: llc -mtriple=thumbv7-windows -o - %s -verify-machineinstrs | FileCheck %s
 ;; Also try to write an object file, which verifies that the SEH opcodes
@@ -174,3 +175,5 @@ declare void @llvm.va_start(ptr)
 declare void @llvm.va_end(ptr)
 
 declare arm_aapcs_vfpcc void @other2(i32 noundef, ptr noundef, ptr noundef, ptr noundef)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
index 43c403fe6d64df..82f3b453e5fecd 100644
--- a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
+++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
@@ -1451,8 +1451,6 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16
 ; CHECK-CORTEX-FIX-NEXT:  @ %bb.1:
 ; CHECK-CORTEX-FIX-NEXT:    vld1.64 {d16, d17}, [r2]
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d16[1]
-; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r6, d17[0]
-; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r7, d17[2]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #20] @ 4-byte Spill
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d16[2]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #8] @ 4-byte Spill
@@ -1462,7 +1460,6 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #12] @ 4-byte Spill
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r3, [r1]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #16] @ 4-byte Spill
-; CHECK-CORTEX-FIX-NEXT:    mov r3, r6
 ; CHECK-CORTEX-FIX-NEXT:    b .LBB36_3
 ; CHECK-CORTEX-FIX-NEXT:  .LBB36_2:
 ; CHECK-CORTEX-FIX-NEXT:    add r3, r2, #8
@@ -1479,12 +1476,12 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #20] @ 4-byte Spill
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d16[2]
 ; CHECK-CORTEX-FIX-NEXT:    str r7, [sp, #12] @ 4-byte Spill
-; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r7, d17[2]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #8] @ 4-byte Spill
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d16[3]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d17[0]
 ; CHECK-CORTEX-FIX-NEXT:  .LBB36_3:
+; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d17[0]
+; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r7, d17[2]
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r4, d17[3]
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
 ; CHECK-CORTEX-FIX-NEXT:    beq .LBB36_5
@@ -3604,8 +3601,6 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16
 ; CHECK-CORTEX-FIX-NEXT:  @ %bb.1:
 ; CHECK-CORTEX-FIX-NEXT:    vld1.64 {d16, d17}, [r2]
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d16[1]
-; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r6, d17[0]
-; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r7, d17[2]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #20] @ 4-byte Spill
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d16[2]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #8] @ 4-byte Spill
@@ -3615,7 +3610,6 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #12] @ 4-byte Spill
 ; CHECK-CORTEX-FIX-NEXT:    ldrh r3, [r1]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #16] @ 4-byte Spill
-; CHECK-CORTEX-FIX-NEXT:    mov r3, r6
 ; CHECK-CORTEX-FIX-NEXT:    b .LBB82_3
 ; CHECK-CORTEX-FIX-NEXT:  .LBB82_2:
 ; CHECK-CORTEX-FIX-NEXT:    add r3, r2, #8
@@ -3632,12 +3626,12 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #20] @ 4-byte Spill
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d16[2]
 ; CHECK-CORTEX-FIX-NEXT:    str r7, [sp, #12] @ 4-byte Spill
-; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r7, d17[2]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #8] @ 4-byte Spill
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d16[3]
 ; CHECK-CORTEX-FIX-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d17[0]
 ; CHECK-CORTEX-FIX-NEXT:  .LBB82_3:
+; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r3, d17[0]
+; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r7, d17[2]
 ; CHECK-CORTEX-FIX-NEXT:    vmov.u16 r4, d17[3]
 ; CHECK-CORTEX-FIX-NEXT:    cmp r0, #0
 ; CHECK-CORTEX-FIX-NEXT:    beq .LBB82_5
diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
index b8ea7c10ad2f4c..631f4bcbdd51aa 100644
--- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
@@ -1268,6 +1268,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s1
 ; CHECK-NEON-NEXT:    vmov.f32 s16, s7
 ; CHECK-NEON-NEXT:    vmov.f32 s18, s6
+; CHECK-NEON-NEXT:    vmov.f32 s20, s5
 ; CHECK-NEON-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEON-NEXT:    vmov.f32 s24, s3
 ; CHECK-NEON-NEXT:    vmov.f32 s26, s2
@@ -1301,15 +1302,14 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
 ; CHECK-NEON-NEXT:    vmov s2, r5
-; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
+; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s30
-; CHECK-NEON-NEXT:    vmov r1, s20
 ; CHECK-NEON-NEXT:    vmov.32 d8[0], r0
 ; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vmov.32 d12[0], r0
-; CHECK-NEON-NEXT:    mov r0, r1
+; CHECK-NEON-NEXT:    vmov r0, s20
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
 ; CHECK-NEON-NEXT:    vmov r0, s20
@@ -1513,6 +1513,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s1
 ; CHECK-NEON-NEXT:    vmov.f32 s16, s7
 ; CHECK-NEON-NEXT:    vmov.f32 s18, s6
+; CHECK-NEON-NEXT:    vmov.f32 s20, s5
 ; CHECK-NEON-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEON-NEXT:    vmov.f32 s24, s3
 ; CHECK-NEON-NEXT:    vmov.f32 s26, s2
@@ -1546,15 +1547,14 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
 ; CHECK-NEON-NEXT:    vmov s2, r5
-; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
+; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s30
-; CHECK-NEON-NEXT:    vmov r1, s20
 ; CHECK-NEON-NEXT:    vmov.32 d8[0], r0
 ; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vmov.32 d12[0], r0
-; CHECK-NEON-NEXT:    mov r0, r1
+; CHECK-NEON-NEXT:    vmov r0, s20
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
 ; CHECK-NEON-NEXT:    vmov r0, s20
@@ -2469,11 +2469,10 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) {
 ; CHECK-NEXT:    moveq r0, r5
 ; CHECK-NEXT:    rsbs r1, r0, #0
 ; CHECK-NEXT:    rscs r1, r3, #0
-; CHECK-NEXT:    vmov r2, r1, d9
 ; CHECK-NEXT:    movwlt r6, #1
 ; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    movne r6, r0
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __aeabi_d2lz
 ; CHECK-NEXT:    subs r2, r0, r5
 ; CHECK-NEXT:    vmov.32 d0[0], r6
@@ -2632,12 +2631,10 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NEXT:    vorr q4, q0, q0
 ; CHECK-NEXT:    vmov r0, s19
 ; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vmov r2, s16
 ; CHECK-NEXT:    mvn r7, #0
 ; CHECK-NEXT:    subs r3, r0, r7
-; CHECK-NEXT:    mov r4, #0
 ; CHECK-NEXT:    sbcs r3, r1, #0
-; CHECK-NEXT:    vmov r9, s18
+; CHECK-NEXT:    mov r4, #0
 ; CHECK-NEXT:    mov r3, #0
 ; CHECK-NEXT:    mov r10, #0
 ; CHECK-NEXT:    movwlt r3, #1
@@ -2645,12 +2642,13 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NEXT:    movne r3, r1
 ; CHECK-NEXT:    moveq r0, r7
 ; CHECK-NEXT:    rsbs r1, r0, #0
-; CHECK-NEXT:    vmov r8, s17
+; CHECK-NEXT:    vmov r9, s18
 ; CHECK-NEXT:    rscs r1, r3, #0
+; CHECK-NEXT:    vmov r8, s17
 ; CHECK-NEXT:    movwlt r4, #1
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    movne r4, r0
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEXT:    subs r2, r0, r7
 ; CHECK-NEXT:    mov r5, #0
@@ -2971,12 +2969,10 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov.f32 s20, s0
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    bl __aeabi_f2lz
-; CHECK-NEON-NEXT:    vmov r2, s20
 ; CHECK-NEON-NEXT:    mvn r7, #0
 ; CHECK-NEON-NEXT:    subs r3, r0, r7
-; CHECK-NEON-NEXT:    mov r4, #0
 ; CHECK-NEON-NEXT:    sbcs r3, r1, #0
-; CHECK-NEON-NEXT:    vmov r8, s18
+; CHECK-NEON-NEXT:    mov r4, #0
 ; CHECK-NEON-NEXT:    mov r3, #0
 ; CHECK-NEON-NEXT:    mov r10, #0
 ; CHECK-NEON-NEXT:    movwlt r3, #1
@@ -2984,12 +2980,13 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NEON-NEXT:    movne r3, r1
 ; CHECK-NEON-NEXT:    moveq r0, r7
 ; CHECK-NEON-NEXT:    rsbs r1, r0, #0
-; CHECK-NEON-NEXT:    vmov r9, s16
+; CHECK-NEON-NEXT:    vmov r8, s18
 ; CHECK-NEON-NEXT:    rscs r1, r3, #0
+; CHECK-NEON-NEXT:    vmov r9, s16
 ; CHECK-NEON-NEXT:    movwlt r4, #1
 ; CHECK-NEON-NEXT:    cmp r4, #0
 ; CHECK-NEON-NEXT:    movne r4, r0
-; CHECK-NEON-NEXT:    mov r0, r2
+; CHECK-NEON-NEXT:    vmov r0, s20
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEON-NEXT:    subs r2, r0, r7
@@ -3257,6 +3254,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s1
 ; CHECK-NEON-NEXT:    vmov.f32 s16, s7
 ; CHECK-NEON-NEXT:    vmov.f32 s18, s6
+; CHECK-NEON-NEXT:    vmov.f32 s20, s5
 ; CHECK-NEON-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEON-NEXT:    vmov.f32 s24, s3
 ; CHECK-NEON-NEXT:    vmov.f32 s26, s2
@@ -3290,15 +3288,14 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
 ; CHECK-NEON-NEXT:    vmov s2, r5
-; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
+; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s30
-; CHECK-NEON-NEXT:    vmov r1, s20
 ; CHECK-NEON-NEXT:    vmov.32 d8[0], r0
 ; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vmov.32 d12[0], r0
-; CHECK-NEON-NEXT:    mov r0, r1
+; CHECK-NEON-NEXT:    vmov r0, s20
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
 ; CHECK-NEON-NEXT:    vmov r0, s20
@@ -3499,6 +3496,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    vmov r0, s1
 ; CHECK-NEON-NEXT:    vmov.f32 s16, s7
 ; CHECK-NEON-NEXT:    vmov.f32 s18, s6
+; CHECK-NEON-NEXT:    vmov.f32 s20, s5
 ; CHECK-NEON-NEXT:    vmov.f32 s22, s4
 ; CHECK-NEON-NEXT:    vmov.f32 s24, s3
 ; CHECK-NEON-NEXT:    vmov.f32 s26, s2
@@ -3532,15 +3530,14 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
 ; CHECK-NEON-NEXT:    vmov s2, r5
-; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s0
-; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
+; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vcvt.s32.f32 s0, s30
-; CHECK-NEON-NEXT:    vmov r1, s20
 ; CHECK-NEON-NEXT:    vmov.32 d8[0], r0
 ; CHECK-NEON-NEXT:    vmov r0, s0
 ; CHECK-NEON-NEXT:    vmov.32 d12[0], r0
-; CHECK-NEON-NEXT:    mov r0, r1
+; CHECK-NEON-NEXT:    vmov r0, s20
+; CHECK-NEON-NEXT:    vcvt.s32.f32 s20, s2
 ; CHECK-NEON-NEXT:    bl __aeabi_h2f
 ; CHECK-NEON-NEXT:    vmov s0, r0
 ; CHECK-NEON-NEXT:    vmov r0, s20
diff --git a/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir b/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir
index c5a8fabfdc79b1..d4446726aaaa73 100644
--- a/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir
+++ b/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=arm-eabi -O1 -run-pass=machine-cp %s -o - \
 # RUN: -verify-machineinstrs -simplify-mir | FileCheck %s
 
@@ -11,3 +12,5 @@ body:             |
 
     renamable $r9 = COPY killed renamable $r0
 ...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
index 7f56215b9b4123..7c82f8294ade31 100644
--- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
@@ -382,14 +382,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; ARM7-NEXT:    bl __moddi3
 ; ARM7-NEXT:    vmov.32 d8[0], r0
 ; ARM7-NEXT:    ldr r0, [sp, #44]
-; ARM7-NEXT:    ldr r2, [sp, #40]
 ; ARM7-NEXT:    mov r5, r1
+; ARM7-NEXT:    mvn r2, #8
 ; ARM7-NEXT:    and r0, r0, #1
 ; ARM7-NEXT:    mvn r3, #0
 ; ARM7-NEXT:    rsb r1, r0, #0
+; ARM7-NEXT:    ldr r0, [sp, #40]
 ; ARM7-NEXT:    vmov.32 d9[0], r7
-; ARM7-NEXT:    mov r0, r2
-; ARM7-NEXT:    mvn r2, #8
 ; ARM7-NEXT:    bl __moddi3
 ; ARM7-NEXT:    vmov.32 d16[0], r0
 ; ARM7-NEXT:    adr r0, .LCPI3_0
@@ -458,14 +457,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; ARM8-NEXT:    bl __moddi3
 ; ARM8-NEXT:    vmov.32 d8[0], r0
 ; ARM8-NEXT:    ldr r0, [sp, #44]
-; ARM8-NEXT:    ldr r2, [sp, #40]
 ; ARM8-NEXT:    mov r5, r1
+; ARM8-NEXT:    mvn r2, #8
 ; ARM8-NEXT:    and r0, r0, #1
 ; ARM8-NEXT:    mvn r3, #0
 ; ARM8-NEXT:    rsb r1, r0, #0
+; ARM8-NEXT:    ldr r0, [sp, #40]
 ; ARM8-NEXT:    vmov.32 d9[0], r7
-; ARM8-NEXT:    mov r0, r2
-; ARM8-NEXT:    mvn r2, #8
 ; ARM8-NEXT:    bl __moddi3
 ; ARM8-NEXT:    vmov.32 d16[0], r0
 ; ARM8-NEXT:    adr r0, .LCPI3_0
@@ -534,14 +532,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; NEON7-NEXT:    bl __moddi3
 ; NEON7-NEXT:    vmov.32 d8[0], r0
 ; NEON7-NEXT:    ldr r0, [sp, #44]
-; NEON7-NEXT:    ldr r2, [sp, #40]
 ; NEON7-NEXT:    mov r5, r1
+; NEON7-NEXT:    mvn r2, #8
 ; NEON7-NEXT:    and r0, r0, #1
 ; NEON7-NEXT:    mvn r3, #0
 ; NEON7-NEXT:    rsb r1, r0, #0
+; NEON7-NEXT:    ldr r0, [sp, #40]
 ; NEON7-NEXT:    vmov.32 d9[0], r7
-; NEON7-NEXT:    mov r0, r2
-; NEON7-NEXT:    mvn r2, #8
 ; NEON7-NEXT:    bl __moddi3
 ; NEON7-NEXT:    vmov.32 d16[0], r0
 ; NEON7-NEXT:    adr r0, .LCPI3_0
@@ -610,14 +607,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; NEON8-NEXT:    bl __moddi3
 ; NEON8-NEXT:    vmov.32 d8[0], r0
 ; NEON8-NEXT:    ldr r0, [sp, #44]
-; NEON8-NEXT:    ldr r2, [sp, #40]
 ; NEON8-NEXT:    mov r5, r1
+; NEON8-NEXT:    mvn r2, #8
 ; NEON8-NEXT:    and r0, r0, #1
 ; NEON8-NEXT:    mvn r3, #0
 ; NEON8-NEXT:    rsb r1, r0, #0
+; NEON8-NEXT:    ldr r0, [sp, #40]
 ; NEON8-NEXT:    vmov.32 d9[0], r7
-; NEON8-NEXT:    mov r0, r2
-; NEON8-NEXT:    mvn r2, #8
 ; NEON8-NEXT:    bl __moddi3
 ; NEON8-NEXT:    vmov.32 d16[0], r0
 ; NEON8-NEXT:    adr r0, .LCPI3_0
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
index c40dd2e9229633..355cff220de6fd 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
@@ -92,21 +92,21 @@ define double @test_v1f64_neutral(<1 x double> %a) nounwind {
 define fp128 @test_v1f128(<1 x fp128> %a, fp128 %s) nounwind {
 ; CHECK-LABEL: test_v1f128:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r11, lr}
-; CHECK-NEXT:    push {r4, r5, r11, lr}
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    str r0, [sp]
 ; CHECK-NEXT:    str r1, [sp, #4]
 ; CHECK-NEXT:    str r2, [sp, #8]
 ; CHECK-NEXT:    str r3, [sp, #12]
-; CHECK-NEXT:    ldr r0, [sp, #32]
-; CHECK-NEXT:    ldr r1, [sp, #36]
-; CHECK-NEXT:    ldr r2, [sp, #40]
-; CHECK-NEXT:    ldr r3, [sp, #44]
+; CHECK-NEXT:    ldr r0, [sp, #24]
+; CHECK-NEXT:    ldr r1, [sp, #28]
+; CHECK-NEXT:    ldr r2, [sp, #32]
+; CHECK-NEXT:    ldr r3, [sp, #36]
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    pop {r4, r5, r11, lr}
+; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
   %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 %s, <1 x fp128> %a)
   ret fp128 %b
diff --git a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
index abb7fff831afe5..69cb84ba97d27f 100644
--- a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
@@ -204,8 +204,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 define i4 @func3(i4 %x, i4 %y) nounwind {
 ; ARM-LABEL: func3:
 ; ARM:       @ %bb.0:
-; ARM-NEXT:    .save {r4, lr}
-; ARM-NEXT:    push {r4, lr}
+; ARM-NEXT:    .save {r7, lr}
+; ARM-NEXT:    push {r7, lr}
 ; ARM-NEXT:    lsls r0, r0, #28
 ; ARM-NEXT:    lsls r1, r1, #28
 ; ARM-NEXT:    asrs r2, r1, #28
@@ -230,7 +230,7 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; ARM-NEXT:    lsls r0, r2, #31
 ; ARM-NEXT:  .LBB2_5:
 ; ARM-NEXT:    asrs r0, r0, #28
-; ARM-NEXT:    pop {r4, pc}
+; ARM-NEXT:    pop {r7, pc}
 ; ARM-NEXT:    .p2align 2
 ; ARM-NEXT:  @ %bb.6:
 ; ARM-NEXT:  .LCPI2_0:
@@ -383,8 +383,8 @@ define i64 @func5(i64 %x, i64 %y) {
 define i4 @func6(i4 %x, i4 %y) nounwind {
 ; ARM-LABEL: func6:
 ; ARM:       @ %bb.0:
-; ARM-NEXT:    .save {r4, lr}
-; ARM-NEXT:    push {r4, lr}
+; ARM-NEXT:    .save {r7, lr}
+; ARM-NEXT:    push {r7, lr}
 ; ARM-NEXT:    lsls r0, r0, #28
 ; ARM-NEXT:    lsls r1, r1, #28
 ; ARM-NEXT:    asrs r2, r1, #28
@@ -407,7 +407,7 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
 ; ARM-NEXT:    mov r2, r0
 ; ARM-NEXT:  .LBB5_5:
 ; ARM-NEXT:    asrs r0, r2, #28
-; ARM-NEXT:    pop {r4, pc}
+; ARM-NEXT:    pop {r7, pc}
 ; ARM-NEXT:    .p2align 2
 ; ARM-NEXT:  @ %bb.6:
 ; ARM-NEXT:  .LCPI5_0:
diff --git a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
index b0cc1c68862983..9b5fa1c2bc8113 100644
--- a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
@@ -9,10 +9,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV6-NEXT:    .pad #60
 ; THUMBV6-NEXT:    sub sp, #60
 ; THUMBV6-NEXT:    mov r6, r3
-; THUMBV6-NEXT:    mov r4, r0
-; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
 ; THUMBV6-NEXT:    mov r1, r2
 ; THUMBV6-NEXT:    str r2, [sp, #52] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r4, r0
+; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
 ; THUMBV6-NEXT:    ldr r2, [sp, #88]
 ; THUMBV6-NEXT:    str r2, [sp, #48] @ 4-byte Spill
 ; THUMBV6-NEXT:    movs r5, #0
diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
index de4b24da27a8d4..370f706e420b26 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
@@ -622,23 +622,23 @@ define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    vmovx.f16 s6, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s10, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s5, s3
+; CHECK-NEXT:    vcvt.s32.f16 s12, s0
+; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s7, s2
-; CHECK-NEXT:    vcvt.s32.f16 s8, s1
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vcvt.s32.f16 s14, s0
 ; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vcvt.s32.f16 s14, s0
-; CHECK-NEXT:    vcvt.s32.f16 s12, s0
+; CHECK-NEXT:    vcvt.s32.f16 s8, s1
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvt.s32.f16 s4, s4
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vcvt.s32.f16 s4, s4
 ; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT:    vcvt.s32.f16 s6, s6
 ; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    mov r0, sp
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
@@ -675,14 +675,13 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __fixdfti
-; CHECK-NEXT:    vmov r12, lr, d8
 ; CHECK-NEXT:    subs.w r4, r0, #-1
 ; CHECK-NEXT:    mvn r9, #-2147483648
 ; CHECK-NEXT:    sbcs.w r4, r1, r9
-; CHECK-NEXT:    sbcs r4, r2, #0
 ; CHECK-NEXT:    mov.w r7, #-1
-; CHECK-NEXT:    sbcs r4, r3, #0
+; CHECK-NEXT:    sbcs r4, r2, #0
 ; CHECK-NEXT:    mov.w r10, #-2147483648
+; CHECK-NEXT:    sbcs r4, r3, #0
 ; CHECK-NEXT:    cset r4, lt
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    csel r3, r3, r4, ne
@@ -696,8 +695,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    cset r5, lt
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csel r8, r1, r10, ne
-; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    mov r1, lr
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __fixdfti
 ; CHECK-NEXT:    subs.w r6, r0, #-1
 ; CHECK-NEXT:    sbcs.w r6, r1, r9
@@ -737,23 +735,20 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-LABEL: utest_f64i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r5, r6, r7, lr}
+; CHECK-NEXT:    push {r5, r6, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __fixunsdfti
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    subs r2, #1
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    sbcs r2, r3, #0
 ; CHECK-NEXT:    cset r6, lo
 ; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    csel r7, r0, r6, ne
-; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __fixunsdfti
 ; CHECK-NEXT:    subs r2, #1
 ; CHECK-NEXT:    sbcs r2, r3, #0
@@ -767,8 +762,7 @@ define arm_aapcs_vfpcc <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r7
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop {r5, r6, r7, pc}
 entry:
   %conv = fptoui <2 x double> %x to <2 x i128>
   %0 = icmp ult <2 x i128> %conv, <i128 18446744073709551616, i128 18446744073709551616>
@@ -789,12 +783,11 @@ define arm_aapcs_vfpcc <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __fixdfti
-; CHECK-NEXT:    vmov r12, lr, d8
 ; CHECK-NEXT:    subs r4, r2, #1
-; CHECK-NEXT:    sbcs r4, r3, #0
 ; CHECK-NEXT:    mov.w r8, #1
+; CHECK-NEXT:    sbcs r4, r3, #0
+; CHECK-NEXT:    mov.w r7, #0
 ; CHECK-NEXT:    cset r5, lt
-; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csel r0, r0, r5, ne
 ; CHECK-NEXT:    csel r3, r3, r5, ne
@@ -807,8 +800,7 @@ define arm_aapcs_vfpcc <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    cset r6, lt
 ; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    csel r9, r0, r6, ne
-; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    mov r1, lr
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __fixdfti
 ; CHECK-NEXT:    subs r5, r2, #1
 ; CHECK-NEXT:    sbcs r5, r3, #0
@@ -1704,23 +1696,23 @@ define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    vmovx.f16 s6, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s10, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s5, s3
+; CHECK-NEXT:    vcvt.s32.f16 s12, s0
+; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s7, s2
-; CHECK-NEXT:    vcvt.s32.f16 s8, s1
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vcvt.s32.f16 s14, s0
 ; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vcvt.s32.f16 s14, s0
-; CHECK-NEXT:    vcvt.s32.f16 s12, s0
+; CHECK-NEXT:    vcvt.s32.f16 s8, s1
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vcvt.s32.f16 s4, s4
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vcvt.s32.f16 s4, s4
 ; CHECK-NEXT:    vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT:    vcvt.s32.f16 s6, s6
 ; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    mov r0, sp
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
@@ -1755,14 +1747,13 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __fixdfti
-; CHECK-NEXT:    vmov r12, lr, d8
 ; CHECK-NEXT:    subs.w r4, r0, #-1
 ; CHECK-NEXT:    mvn r9, #-2147483648
 ; CHECK-NEXT:    sbcs.w r4, r1, r9
-; CHECK-NEXT:    sbcs r4, r2, #0
 ; CHECK-NEXT:    mov.w r7, #-1
-; CHECK-NEXT:    sbcs r4, r3, #0
+; CHECK-NEXT:    sbcs r4, r2, #0
 ; CHECK-NEXT:    mov.w r10, #-2147483648
+; CHECK-NEXT:    sbcs r4, r3, #0
 ; CHECK-NEXT:    cset r4, lt
 ; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    csel r3, r3, r4, ne
@@ -1776,8 +1767,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT:    cset r5, lt
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    csel r8, r1, r10, ne
-; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    mov r1, lr
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __fixdfti
 ; CHECK-NEXT:    subs.w r6, r0, #-1
 ; CHECK-NEXT:    sbcs.w r6, r1, r9
@@ -1815,23 +1805,20 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-LABEL: utest_f64i64_mm:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r5, r6, r7, lr}
+; CHECK-NEXT:    push {r5, r6, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __fixunsdfti
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    subs r2, #1
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    sbcs r2, r3, #0
 ; CHECK-NEXT:    cset r6, lo
 ; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    csel r7, r0, r6, ne
-; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __fixunsdfti
 ; CHECK-NEXT:    subs r2, #1
 ; CHECK-NEXT:    sbcs r2, r3, #0
@@ -1845,8 +1832,7 @@ define arm_aapcs_vfpcc <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r7
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
 ; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop {r5, r6, r7, pc}
 entry:
   %conv = fptoui <2 x double> %x to <2 x i128>
   %spec.store.select = call <2 x i128> @llvm.umin.v2i128(<2 x i128> %conv, <2 x i128> <i128 18446744073709551616, i128 18446744073709551616>)
@@ -1864,18 +1850,17 @@ define arm_aapcs_vfpcc <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __fixdfti
-; CHECK-NEXT:    mov r8, r1
-; CHECK-NEXT:    vmov r4, r1, d8
 ; CHECK-NEXT:    subs r2, #1
+; CHECK-NEXT:    mov r8, r1
 ; CHECK-NEXT:    sbcs r2, r3, #0
 ; CHECK-NEXT:    cset r7, lt
 ; CHECK-NEXT:    cmp r7, #0
 ; CHECK-NEXT:    csel r6, r0, r7, ne
+; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    csel r5, r3, r7, ne
 ; CHECK-NEXT:    cmp r5, #0
 ; CHECK-NEXT:    it mi
 ; CHECK-NEXT:    movmi r6, #0
-; CHECK-NEXT:    mov r0, r4
 ; CHECK-NEXT:    bl __fixdfti
 ; CHECK-NEXT:    subs r2, #1
 ; CHECK-NEXT:    sbcs r2, r3, #0
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index aa8c618b412748..0f7f31077414a3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -1379,12 +1379,12 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f16_v5i32(<5 x half> %f) {
 ; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s8, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vcvt.s32.f16 s4, s4
 ; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vcvt.s32.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vcvt.s32.f16 s4, s4
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
-; CHECK-NEXT:    vcvt.s32.f16 s2, s2
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
@@ -1404,11 +1404,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-NEXT:    vcvt.s32.f16 s10, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s4, s2
-; CHECK-NEXT:    vcvt.s32.f16 s2, s2
-; CHECK-NEXT:    vcvt.s32.f16 s8, s8
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vcvt.s32.f16 s8, s8
 ; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vcvt.s32.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
 ; CHECK-NEXT:    vmov r1, s6
@@ -1431,11 +1431,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_signed_v7f16_v7i32(<7 x half> %f) {
 ; CHECK-NEXT:    vcvt.s32.f16 s12, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s4, s2
-; CHECK-NEXT:    vcvt.s32.f16 s2, s2
-; CHECK-NEXT:    vcvt.s32.f16 s10, s10
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s8, s8
+; CHECK-NEXT:    vcvt.s32.f16 s10, s10
 ; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vcvt.s32.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s6, s3
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
@@ -1465,9 +1465,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32(<8 x half> %f) {
 ; CHECK-NEXT:    vcvt.s32.f16 s14, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s2, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vcvt.s32.f16 s6, s6
 ; CHECK-NEXT:    vcvt.s32.f16 s4, s4
+; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s12, s3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
@@ -2122,12 +2122,10 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    mov r9, r0
 ; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    mov r10, r3
-; CHECK-NEXT:    vmov r3, s16
 ; CHECK-NEXT:    vldr s22, .LCPI30_0
-; CHECK-NEXT:    vmov r7, s17
+; CHECK-NEXT:    mov r10, r3
 ; CHECK-NEXT:    vldr s20, .LCPI30_1
-; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    vmov r7, s17
 ; CHECK-NEXT:    vcmp.f32 s18, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s18, s20
@@ -2167,7 +2165,8 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    str.w r0, [r9, #25]
-; CHECK-NEXT:    mov r0, r3
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vmov r4, s19
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s16, s22
 ; CHECK-NEXT:    mov r11, r3
@@ -2372,20 +2371,18 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 define arm_aapcs_vfpcc <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i128:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r6, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    vmov r0, s19
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vmov r5, s18
 ; CHECK-NEXT:    vldr s22, .LCPI31_0
-; CHECK-NEXT:    vldr s20, .LCPI31_1
 ; CHECK-NEXT:    vmov r7, s16
+; CHECK-NEXT:    vldr s20, .LCPI31_1
+; CHECK-NEXT:    vmov r6, s17
 ; CHECK-NEXT:    vcmp.f32 s19, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s19, s20
@@ -2438,8 +2435,7 @@ define arm_aapcs_vfpcc <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    str r0, [r4, #48]
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    vmov r6, s17
+; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s18, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -2602,8 +2598,7 @@ define arm_aapcs_vfpcc <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    str r0, [r4]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop {r4, r6, r7, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI31_0:
@@ -4823,9 +4818,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32_duplicate(<8 x half> %
 ; CHECK-NEXT:    vcvt.s32.f16 s14, s2
 ; CHECK-NEXT:    vcvt.s32.f16 s2, s1
 ; CHECK-NEXT:    vcvt.s32.f16 s0, s0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vcvt.s32.f16 s6, s6
 ; CHECK-NEXT:    vcvt.s32.f16 s4, s4
+; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vcvt.s32.f16 s12, s3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 1849341ce72b7b..f3386f27bb5e7a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -1145,12 +1145,12 @@ define arm_aapcs_vfpcc <5 x i32> @test_unsigned_v5f16_v5i32(<5 x half> %f) {
 ; CHECK-NEXT:    vmovx.f16 s4, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s8, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
-; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vcvt.u32.f16 s4, s4
 ; CHECK-NEXT:    vcvt.u32.f16 s6, s6
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vcvt.u32.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vcvt.u32.f16 s4, s4
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
-; CHECK-NEXT:    vcvt.u32.f16 s2, s2
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
@@ -1170,11 +1170,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-NEXT:    vcvt.u32.f16 s10, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s4, s2
-; CHECK-NEXT:    vcvt.u32.f16 s2, s2
-; CHECK-NEXT:    vcvt.u32.f16 s8, s8
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vcvt.u32.f16 s6, s6
+; CHECK-NEXT:    vcvt.u32.f16 s8, s8
 ; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vcvt.u32.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
 ; CHECK-NEXT:    vmov r1, s6
@@ -1197,11 +1197,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_unsigned_v7f16_v7i32(<7 x half> %f) {
 ; CHECK-NEXT:    vcvt.u32.f16 s12, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s4, s2
-; CHECK-NEXT:    vcvt.u32.f16 s2, s2
-; CHECK-NEXT:    vcvt.u32.f16 s10, s10
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vcvt.u32.f16 s8, s8
+; CHECK-NEXT:    vcvt.u32.f16 s10, s10
 ; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vmovx.f16 s2, s2
+; CHECK-NEXT:    vcvt.u32.f16 s2, s2
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s6, s3
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
@@ -1231,9 +1231,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_unsigned_v8f16_v8i32(<8 x half> %f) {
 ; CHECK-NEXT:    vcvt.u32.f16 s14, s2
 ; CHECK-NEXT:    vcvt.u32.f16 s2, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vcvt.u32.f16 s6, s6
 ; CHECK-NEXT:    vcvt.u32.f16 s4, s4
+; CHECK-NEXT:    vcvt.u32.f16 s6, s6
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s12, s3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
@@ -1739,14 +1739,13 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    mov r8, r0
 ; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    mov r9, r3
-; CHECK-NEXT:    vmov r3, s16
 ; CHECK-NEXT:    vldr s20, .LCPI30_0
 ; CHECK-NEXT:    vcmp.f32 s18, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    mov r9, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s18, #0
 ; CHECK-NEXT:    it gt
@@ -1769,9 +1768,9 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    str.w r0, [r8, #25]
-; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    vmov r7, s17
-; CHECK-NEXT:    mov r0, r3
+; CHECK-NEXT:    vmov r4, s19
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    mov r10, r3
@@ -1922,23 +1921,20 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 define arm_aapcs_vfpcc <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-LABEL: test_unsigned_v4f32_v4i128:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r6, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10}
 ; CHECK-NEXT:    vpush {d8, d9, d10}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    vmov r0, s19
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vmov r5, s18
 ; CHECK-NEXT:    vldr s20, .LCPI31_0
 ; CHECK-NEXT:    vcmp.f32 s19, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    vcmp.f32 s19, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s19, #0
 ; CHECK-NEXT:    it gt
@@ -1970,9 +1966,9 @@ define arm_aapcs_vfpcc <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    str r0, [r4, #48]
+; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vmov r7, s16
 ; CHECK-NEXT:    vmov r6, s17
-; CHECK-NEXT:    mov r0, r5
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s18, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
@@ -2087,8 +2083,7 @@ define arm_aapcs_vfpcc <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    str r0, [r4]
 ; CHECK-NEXT:    vpop {d8, d9, d10}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    pop {r4, r6, r7, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI31_0:
@@ -3735,9 +3730,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_unsigned_v8f16_v8i32_duplicate(<8 x half>
 ; CHECK-NEXT:    vcvt.u32.f16 s14, s2
 ; CHECK-NEXT:    vcvt.u32.f16 s2, s1
 ; CHECK-NEXT:    vcvt.u32.f16 s0, s0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vcvt.u32.f16 s6, s6
 ; CHECK-NEXT:    vcvt.u32.f16 s4, s4
+; CHECK-NEXT:    vcvt.u32.f16 s6, s6
+; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    vcvt.u32.f16 s12, s3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index e67b2fe32b7e2a..fe28f785623ed5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -65,13 +65,13 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-LABEL: ext_add_trunc_i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vmov.f32 s2, s3
 ; CHECK-NEXT:    vmov.f32 s6, s7
 ; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.f32 s8, s3
+; CHECK-NEXT:    vmov.f32 s8, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov.f32 s2, s5
-; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    add.w r12, r1, r0
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vmov r0, s0
diff --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
index d536e6b72ac9c3..dc4b8c94a8c676 100644
--- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
@@ -310,13 +310,12 @@ define arm_aapcs_vfpcc <2 x double> @maxnm_float64_t(<2 x double> %src1, <2 x do
 ; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    vmov r2, r3, d10
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    vmov r12, r1, d9
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    vmov r2, r3, d11
+; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    csetm r0, ne
-; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    vmov r2, r3, d11
 ; CHECK-NEXT:    bfi r4, r0, #0, #8
-; CHECK-NEXT:    mov r0, r12
+; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    csetm r0, ne
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
index 117469f3bd788b..9bac575b571a26 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
@@ -171,12 +171,11 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2f64(<2 x double> %src) {
 ; CHECK-MVE-NEXT:    mov r2, r4
 ; CHECK-MVE-NEXT:    mov r3, r5
 ; CHECK-MVE-NEXT:    bl __aeabi_dcmpeq
-; CHECK-MVE-NEXT:    vmov r2, r1, d8
 ; CHECK-MVE-NEXT:    cmp r0, #0
+; CHECK-MVE-NEXT:    vmov r0, r1, d8
+; CHECK-MVE-NEXT:    mov r2, r4
 ; CHECK-MVE-NEXT:    mov r3, r5
 ; CHECK-MVE-NEXT:    csetm r6, eq
-; CHECK-MVE-NEXT:    mov r0, r2
-; CHECK-MVE-NEXT:    mov r2, r4
 ; CHECK-MVE-NEXT:    bl __aeabi_dcmpeq
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    csetm r0, eq
@@ -407,14 +406,13 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2f64(<2 x double> %src) {
 ; CHECK-MVE-NEXT:    mov r2, r4
 ; CHECK-MVE-NEXT:    mov r3, r5
 ; CHECK-MVE-NEXT:    bl __aeabi_dcmpeq
-; CHECK-MVE-NEXT:    vmov r2, r1, d8
-; CHECK-MVE-NEXT:    adr r3, .LCPI13_1
 ; CHECK-MVE-NEXT:    cmp r0, #0
+; CHECK-MVE-NEXT:    vmov r0, r1, d8
+; CHECK-MVE-NEXT:    adr r3, .LCPI13_1
+; CHECK-MVE-NEXT:    mov r2, r4
 ; CHECK-MVE-NEXT:    vldrw.u32 q4, [r3]
 ; CHECK-MVE-NEXT:    mov r3, r5
 ; CHECK-MVE-NEXT:    csetm r6, eq
-; CHECK-MVE-NEXT:    mov r0, r2
-; CHECK-MVE-NEXT:    mov r2, r4
 ; CHECK-MVE-NEXT:    bl __aeabi_dcmpeq
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    csetm r0, eq
@@ -942,8 +940,7 @@ define arm_aapcs_vfpcc <2 x double> @uitofp_v2i1_v2f64(<2 x i64> %src) {
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sbcs.w r2, r4, r3
-; CHECK-NEXT:    cset r2, lt
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    cset r0, lt
 ; CHECK-NEXT:    bl __aeabi_ui2d
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -973,8 +970,7 @@ define arm_aapcs_vfpcc <2 x double> @sitofp_v2i1_v2f64(<2 x i64> %src) {
 ; CHECK-NEXT:    vmov d9, r0, r1
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    sbcs.w r2, r4, r3
-; CHECK-NEXT:    csetm r2, lt
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    csetm r0, lt
 ; CHECK-NEXT:    bl __aeabi_i2d
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    vmov q0, q4
@@ -996,14 +992,13 @@ define arm_aapcs_vfpcc <2 x double> @fptoui_v2i1_v2f64(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __aeabi_d2iz
-; CHECK-NEXT:    vmov r2, r1, d9
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    adr r3, .LCPI28_0
 ; CHECK-NEXT:    bfi r4, r0, #0, #8
+; CHECK-NEXT:    vmov r0, r1, d9
+; CHECK-NEXT:    adr r3, .LCPI28_0
 ; CHECK-NEXT:    vmov.i32 q4, #0x0
 ; CHECK-NEXT:    vldrw.u32 q5, [r3]
-; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl __aeabi_d2iz
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    bfi r4, r0, #8, #8
@@ -1034,13 +1029,12 @@ define arm_aapcs_vfpcc <2 x double> @fptosi_v2i1_v2f64(<2 x double> %src) {
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, r1, d8
 ; CHECK-NEXT:    bl __aeabi_d2iz
-; CHECK-NEXT:    vmov r2, r1, d9
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    adr r3, .LCPI29_0
 ; CHECK-NEXT:    bfi r4, r0, #0, #8
+; CHECK-NEXT:    vmov r0, r1, d9
 ; CHECK-NEXT:    vmov.i32 q4, #0x0
 ; CHECK-NEXT:    vldrw.u32 q5, [r3]
-; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bl __aeabi_d2iz
 ; CHECK-NEXT:    bfi r4, r0, #8, #8
 ; CHECK-NEXT:    vmsr p0, r4
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index aaee97318ecd0d..f4643f8c6c4a1f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -7,10 +7,10 @@
 define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) {
 ; CHECK-LABEL: shuffle1_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -30,10 +30,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @shuffle3_i32(<4 x i32> %src) {
 ; CHECK-LABEL: shuffle3_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s1
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s1
 ; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -923,10 +923,10 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shuffle2_i64(<2 x i64> %src) {
 ; CHECK-LABEL: shuffle2_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s7, s1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -948,10 +948,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @shuffle1_f32(<4 x float> %src) {
 ; CHECK-LABEL: shuffle1_f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -971,10 +971,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @shuffle3_f32(<4 x float> %src) {
 ; CHECK-LABEL: shuffle3_f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s1
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s1
 ; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1383,10 +1383,10 @@ entry:
 define arm_aapcs_vfpcc <2 x double> @shuffle2_f64(<2 x double> %src) {
 ; CHECK-LABEL: shuffle2_f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s7, s1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
index de9328a3c24234..6ce75500142964 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
@@ -7,10 +7,10 @@
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_45670123(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_45670123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s7, s1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -21,10 +21,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_67452301(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_67452301:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -69,10 +69,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_u7u5u3u1(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_u7u5u3u1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -83,10 +83,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_6u4u2u0u(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_6u4u2u0u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -120,10 +120,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45670123(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cdef89ab45670123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -213,10 +213,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdeu89ub4u67u123(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cdeu89ub4u67u123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -227,10 +227,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cduu8uubuu67u12u(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cduu8uubuu67u12u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -241,10 +241,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cuuuuuubuu6uuu2u(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cuuuuuubuu6uuu2u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -261,9 +261,9 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45u700123(<16 x i8> %s1, <1
 ; CHECK-NEXT:    vmov.8 q1[9], r0
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
 ; CHECK-NEXT:    vmov.8 q1[11], r0
-; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov.f32 s4, s3
 ; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -278,10 +278,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_45670123(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_45670123:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vmov.f32 s7, s1
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -292,10 +292,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_67452301(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_67452301:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -340,10 +340,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_u7u5u3u1(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_u7u5u3u1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -354,10 +354,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_6u4u2u0u(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_6u4u2u0u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s4, s3
+; CHECK-NEXT:    vmov.f32 s5, s2
 ; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index 04dcc77e9f937b..1279714b5a78ca 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -374,17 +374,17 @@ define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y,
 ; CHECK-NEXT:  .LBB17_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
-; CHECK-NEXT:    vmov.f32 s12, s10
 ; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov r7, s4
 ; CHECK-NEXT:    vmov.f32 s6, s7
-; CHECK-NEXT:    vmov r4, s12
 ; CHECK-NEXT:    vmov r3, s8
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
+; CHECK-NEXT:    vmov.f32 s12, s10
 ; CHECK-NEXT:    vmov.f32 s10, s5
 ; CHECK-NEXT:    vmov.f32 s14, s11
-; CHECK-NEXT:    subs.w r8, r3, r4
+; CHECK-NEXT:    vmov r4, s12
 ; CHECK-NEXT:    asr.w r12, r3, #31
+; CHECK-NEXT:    subs.w r8, r3, r4
 ; CHECK-NEXT:    sbc.w r12, r12, r4, asr #31
 ; CHECK-NEXT:    vmov r4, s10
 ; CHECK-NEXT:    vmov.f32 s10, s9
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
index 82db1a95037a9d..a5725a2a300483 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
@@ -18,10 +18,10 @@ entry:
 define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
 ; CHECK-LABEL: fpext_8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcvtt.f32.f16 s9, s0
+; CHECK-NEXT:    vcvtt.f32.f16 s11, s1
 ; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
+; CHECK-NEXT:    vcvtt.f32.f16 s9, s0
 ; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
-; CHECK-NEXT:    vcvtt.f32.f16 s11, s1
 ; CHECK-NEXT:    vcvtt.f32.f16 s7, s3
 ; CHECK-NEXT:    vcvtb.f32.f16 s6, s3
 ; CHECK-NEXT:    vcvtt.f32.f16 s5, s2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index 271beac1392880..b49f19e55c895a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -10,15 +10,17 @@ define void @vld4_v2i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vmov.f32 s10, s7
 ; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov.f32 s6, s5
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    vmov.f32 s8, s3
 ; CHECK-NEXT:    vmov.f32 s12, s1
 ; CHECK-NEXT:    vmov r0, s10
 ; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    add r2, r3
+; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    add.w r12, r2, r0
 ; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov r3, s2
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    add r2, r3
 ; CHECK-NEXT:    vmov r3, s12
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
index 8e6e0191e26703..b005cb92dc5169 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -330,8 +330,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmovn32_t2(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_t2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s2
 ; CHECK-NEXT:    vmov.f32 s5, s0
+; CHECK-NEXT:    vmov.f32 s7, s2
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 ;
@@ -420,8 +420,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmovn32_b4(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_b4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s7, s3
 ; CHECK-NEXT:    vmov.f32 s5, s1
+; CHECK-NEXT:    vmov.f32 s7, s3
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 ;
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 176246427e64ce..d107edbc4c9097 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -131,19 +131,18 @@ define void @vst4_v16i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vmov q7, q5
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    vmov q6, q2
-; CHECK-NEXT:    vmov q5, q1
 ; CHECK-NEXT:    vmov q7, q3
-; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #64
+; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
@@ -897,19 +896,18 @@ define void @vst4_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vmov q7, q5
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    vmov q6, q2
-; CHECK-NEXT:    vmov q5, q1
 ; CHECK-NEXT:    vmov q7, q3
-; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #64
+; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #96]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vst40.32 {q0, q1, q2, q3}, [r1]
diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
index bad1bf15c145f3..297667cf979add 100644
--- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
@@ -65,9 +65,9 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(ptr nocapture readonly %pSrc, i32
 ; CHECK-NEXT:    vmov.f32 s0, s18
 ; CHECK-NEXT:    vmov.f32 s2, s19
 ; CHECK-NEXT:    vmov d9, r0, r1
+; CHECK-NEXT:    vand q5, q0, q5
 ; CHECK-NEXT:    vmov r0, r1, d12
 ; CHECK-NEXT:    vmov r4, r5, d11
-; CHECK-NEXT:    vand q5, q0, q5
 ; CHECK-NEXT:    bl __aeabi_ul2d
 ; CHECK-NEXT:    vmov d8, r0, r1
 ; CHECK-NEXT:    mov r0, r4
diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
index f9d6663b5b8a30..ded4134589c0f3 100644
--- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
@@ -19,14 +19,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq 24(%rdi), %r29
 ; EGPR-NEXT:    movq 16(%rdi), %r17
 ; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq 32(%rdi), %r10
-; EGPR-NEXT:    movq 56(%rdi), %r15
-; EGPR-NEXT:    movq 40(%rdi), %rdi
-; EGPR-NEXT:    movq 48(%r24), %r12
 ; EGPR-NEXT:    movq 24(%rsi), %r23
 ; EGPR-NEXT:    movq 16(%rsi), %r11
 ; EGPR-NEXT:    movq (%rsi), %r27
 ; EGPR-NEXT:    movq 8(%rsi), %r14
+; EGPR-NEXT:    movq 40(%rdi), %rsi
+; EGPR-NEXT:    movq 32(%rdi), %r10
+; EGPR-NEXT:    movq 56(%rdi), %r15
+; EGPR-NEXT:    movq 48(%rdi), %r12
 ; EGPR-NEXT:    movq %r12, %rax
 ; EGPR-NEXT:    mulq %r27
 ; EGPR-NEXT:    movq %rdx, %r8
@@ -55,7 +55,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    mulq %r27
 ; EGPR-NEXT:    movq %rdx, %r20
 ; EGPR-NEXT:    movq %rax, %r25
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    movq %rsi, %rax
 ; EGPR-NEXT:    mulq %r27
 ; EGPR-NEXT:    movq %rdx, %r21
 ; EGPR-NEXT:    movq %rax, %r22
@@ -69,7 +69,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r21, %r20
 ; EGPR-NEXT:    setb %al
 ; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    movq %rsi, %rax
 ; EGPR-NEXT:    mulq %r14
 ; EGPR-NEXT:    movq %rdx, %r21
 ; EGPR-NEXT:    movq %rax, %r22
@@ -84,8 +84,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    mulq %r11
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq %rsi, %rax
+; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    mulq %r11
 ; EGPR-NEXT:    movq %rdx, %r19
 ; EGPR-NEXT:    movq %rax, %r20
@@ -99,7 +99,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r19, %rbx
 ; EGPR-NEXT:    setb %al
 ; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    movq %rsi, %rax
 ; EGPR-NEXT:    mulq %r23
 ; EGPR-NEXT:    movq %rdx, %r26
 ; EGPR-NEXT:    movq %rax, %r8
@@ -1295,22 +1295,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-NEXT:    adcq %rcx, %r29, %r8
 ; EGPR-NDD-NEXT:    adcq $0, %rdi
 ; EGPR-NDD-NEXT:    adcq $0, %rsi, %r9
-; EGPR-NDD-NEXT:    movq %r11, %r14
-; EGPR-NDD-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movq 48(%r11), %r11
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    movq 48(%r15), %r11
 ; EGPR-NDD-NEXT:    movq %r17, %rsi
 ; EGPR-NDD-NEXT:    movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    movq %r17, %rax
 ; EGPR-NDD-NEXT:    mulq %r11
-; EGPR-NDD-NEXT:    movq %rax, %r29
-; EGPR-NDD-NEXT:    movq %r10, %rsi
 ; EGPR-NDD-NEXT:    movq %rdx, %r28
-; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    movq %r16, %r10
-; EGPR-NDD-NEXT:    movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movq %rax, %r29
+; EGPR-NDD-NEXT:    movq %r10, %rax
+; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    mulq %r11
 ; EGPR-NDD-NEXT:    addq %rax, %r28
 ; EGPR-NDD-NEXT:    adcq $0, %rdx, %rcx
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index ab48bc292a40cc..3fb994cdb751a3 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -332,11 +332,11 @@ define void @store_i256(ptr %ptr, i256 %v) {
 ; CHECK-O3:       # %bb.0:
 ; CHECK-O3-NEXT:    subq $40, %rsp
 ; CHECK-O3-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-O3-NEXT:    movq %rsi, (%rsp)
 ; CHECK-O3-NEXT:    movq %rdi, %rax
 ; CHECK-O3-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
 ; CHECK-O3-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; CHECK-O3-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-O3-NEXT:    movq %rsi, (%rsp)
 ; CHECK-O3-NEXT:    movq %rsp, %rdx
 ; CHECK-O3-NEXT:    movl $32, %edi
 ; CHECK-O3-NEXT:    movq %rax, %rsi
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index 5d97a56ce12c6b..b39b089faa2a5e 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -937,17 +937,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kmovw %k1, %r12d
 ; KNL-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r13d
+; KNL-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL-NEXT:    andl $1, %edx
 ; KNL-NEXT:    movb %dl, 2(%rax)
 ; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    andl $1, %edx
 ; KNL-NEXT:    andl $1, %r9d
-; KNL-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL-NEXT:    leal (%rdx,%r9,2), %r9d
 ; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    andl $1, %r8d
 ; KNL-NEXT:    leal (%r9,%r8,4), %r9d
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %r8d
 ; KNL-NEXT:    andl $1, %esi
 ; KNL-NEXT:    leal (%r9,%rsi,8), %esi
@@ -1250,17 +1250,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kmovd %k1, %r12d
 ; SKX-NEXT:    kshiftrd $13, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r13d
+; SKX-NEXT:    kshiftrd $14, %k0, %k1
 ; SKX-NEXT:    andl $1, %edx
 ; SKX-NEXT:    movb %dl, 2(%rax)
 ; SKX-NEXT:    kmovd %k0, %edx
 ; SKX-NEXT:    andl $1, %edx
 ; SKX-NEXT:    andl $1, %r9d
-; SKX-NEXT:    kshiftrd $14, %k0, %k1
 ; SKX-NEXT:    leal (%rdx,%r9,2), %r9d
 ; SKX-NEXT:    kmovd %k1, %edx
+; SKX-NEXT:    kshiftrd $15, %k0, %k0
 ; SKX-NEXT:    andl $1, %r8d
 ; SKX-NEXT:    leal (%r9,%r8,4), %r9d
-; SKX-NEXT:    kshiftrd $15, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %r8d
 ; SKX-NEXT:    andl $1, %esi
 ; SKX-NEXT:    leal (%r9,%rsi,8), %esi
@@ -1563,56 +1563,56 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kmovw %k1, %edx
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ecx
+; KNL_X32-NEXT:    kshiftrw $6, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebx
 ; KNL_X32-NEXT:    movb %bl, 2(%eax)
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebp
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k1
 ; KNL_X32-NEXT:    leal (%ebx,%ebp,2), %ebx
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
+; KNL_X32-NEXT:    kshiftrw $7, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %esi
 ; KNL_X32-NEXT:    leal (%ebx,%esi,4), %ebx
-; KNL_X32-NEXT:    kshiftrw $7, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edi
 ; KNL_X32-NEXT:    leal (%ebx,%edi,8), %ebx
-; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edi
+; KNL_X32-NEXT:    kshiftrw $9, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    shll $4, %edx
 ; KNL_X32-NEXT:    orl %ebx, %edx
-; KNL_X32-NEXT:    kshiftrw $9, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ecx
 ; KNL_X32-NEXT:    shll $5, %ecx
 ; KNL_X32-NEXT:    orl %edx, %ecx
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edx
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebp
 ; KNL_X32-NEXT:    shll $6, %ebp
 ; KNL_X32-NEXT:    andl $1, %esi
 ; KNL_X32-NEXT:    shll $7, %esi
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k1
 ; KNL_X32-NEXT:    orl %ebp, %esi
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edi
 ; KNL_X32-NEXT:    shll $8, %edi
 ; KNL_X32-NEXT:    orl %esi, %edi
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebx
 ; KNL_X32-NEXT:    shll $9, %ebx
 ; KNL_X32-NEXT:    orl %edi, %ebx
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edi
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    shll $10, %edx
 ; KNL_X32-NEXT:    orl %ebx, %edx
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ebx
-; KNL_X32-NEXT:    orl %ecx, %edx
 ; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    orl %ecx, %edx
 ; KNL_X32-NEXT:    kmovw %k0, %ecx
 ; KNL_X32-NEXT:    andl $1, %ebp
 ; KNL_X32-NEXT:    shll $11, %ebp
@@ -1891,17 +1891,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kmovd %k1, %r12d
 ; FASTISEL-NEXT:    kshiftrd $13, %k0, %k1
 ; FASTISEL-NEXT:    kmovd %k1, %r13d
+; FASTISEL-NEXT:    kshiftrd $14, %k0, %k1
 ; FASTISEL-NEXT:    andl $1, %edx
 ; FASTISEL-NEXT:    movb %dl, 2(%rax)
 ; FASTISEL-NEXT:    kmovd %k0, %edx
 ; FASTISEL-NEXT:    andl $1, %edx
 ; FASTISEL-NEXT:    andl $1, %r9d
-; FASTISEL-NEXT:    kshiftrd $14, %k0, %k1
 ; FASTISEL-NEXT:    leal (%rdx,%r9,2), %r9d
 ; FASTISEL-NEXT:    kmovd %k1, %edx
+; FASTISEL-NEXT:    kshiftrd $15, %k0, %k0
 ; FASTISEL-NEXT:    andl $1, %r8d
 ; FASTISEL-NEXT:    leal (%r9,%r8,4), %r9d
-; FASTISEL-NEXT:    kshiftrd $15, %k0, %k0
 ; FASTISEL-NEXT:    kmovd %k0, %r8d
 ; FASTISEL-NEXT:    andl $1, %esi
 ; FASTISEL-NEXT:    leal (%r9,%rsi,8), %esi
@@ -3113,22 +3113,22 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL_X32-NEXT:    kmovw %k1, %eax
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edx
+; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andb $1, %bl
 ; KNL_X32-NEXT:    andb $1, %dl
 ; KNL_X32-NEXT:    addb %dl, %dl
-; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
 ; KNL_X32-NEXT:    orb %bl, %dl
 ; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
 ; KNL_X32-NEXT:    andb $1, %bl
 ; KNL_X32-NEXT:    shlb $2, %bl
 ; KNL_X32-NEXT:    orb %dl, %bl
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edx
+; KNL_X32-NEXT:    kshiftrw $4, %k0, %k0
 ; KNL_X32-NEXT:    andb $1, %dl
 ; KNL_X32-NEXT:    shlb $3, %dl
 ; KNL_X32-NEXT:    orb %bl, %dl
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andb $1, %bl
 ; KNL_X32-NEXT:    shlb $4, %bl
diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
index 2cace3060def4b..1e6b30400d8190 100644
--- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
@@ -10,18 +10,18 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8>
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT:    vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
-; X86BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xc9,0x04]
-; X86BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
+; X86BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_128:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64BW-NEXT:    vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
-; X64BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xc9,0x04]
-; X64BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
+; X64BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x03]
+; X64BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_128:
@@ -69,18 +69,18 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8>
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT:    vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
-; X86BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xc9,0x04]
-; X86BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
+; X86BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_256:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64BW-NEXT:    vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
-; X64BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xc9,0x04]
-; X64BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X64BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
+; X64BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x03]
+; X64BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_256:
@@ -135,18 +135,18 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8>
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT:    vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
-; X86BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xc9,0x04]
-; X86BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X86BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
+; X86BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_512:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64BW-NEXT:    vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
-; X64BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xc9,0x04]
-; X64BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X64BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
+; X64BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x03]
+; X64BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_512:
@@ -221,18 +221,18 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT:    vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
-; X86BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xc9,0x04]
-; X86BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
+; X86BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_128:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64BW-NEXT:    vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
-; X64BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xc9,0x04]
-; X64BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
+; X64BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x03]
+; X64BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_128:
@@ -280,18 +280,18 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT:    vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
-; X86BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xc9,0x04]
-; X86BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
+; X86BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_256:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64BW-NEXT:    vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
-; X64BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xc9,0x04]
-; X64BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X64BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
+; X64BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x03]
+; X64BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_256:
@@ -346,18 +346,18 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT:    vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
-; X86BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xc9,0x04]
-; X86BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X86BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
+; X86BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_512:
 ; X64BW:       # %bb.0:
 ; X64BW-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64BW-NEXT:    vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
-; X64BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X64BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xc9,0x04]
-; X64BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X64BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
+; X64BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x03]
+; X64BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X64BW-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_512:
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 113828ae54ccd2..2a77d0238721c0 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -313,10 +313,10 @@ define i16 @test15(ptr%addr) nounwind {
 define i16 @test16(ptr%addr, i16 %a) nounwind {
 ; KNL-LABEL: test16:
 ; KNL:       ## %bb.0:
+; KNL-NEXT:    movzbl (%rdi), %eax
 ; KNL-NEXT:    kmovw %esi, %k0
 ; KNL-NEXT:    movw $-1025, %cx ## imm = 0xFBFF
 ; KNL-NEXT:    kmovw %ecx, %k1
-; KNL-NEXT:    movzbl (%rdi), %eax
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -349,10 +349,10 @@ define i16 @test16(ptr%addr, i16 %a) nounwind {
 define i8 @test17(ptr%addr, i8 %a) nounwind {
 ; KNL-LABEL: test17:
 ; KNL:       ## %bb.0:
+; KNL-NEXT:    movzbl (%rdi), %eax
 ; KNL-NEXT:    kmovw %esi, %k0
 ; KNL-NEXT:    movw $-17, %cx
 ; KNL-NEXT:    kmovw %ecx, %k1
-; KNL-NEXT:    movzbl (%rdi), %eax
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -843,12 +843,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; KNL-LABEL: test_insertelement_v32i1:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
-; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    movw $-17, %dx
 ; KNL-NEXT:    kmovw %edx, %k1
-; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
@@ -862,12 +862,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; SKX-LABEL: test_insertelement_v32i1:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    cmpl %esi, %edi
+; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
 ; SKX-NEXT:    vpcmpltud %zmm3, %zmm1, %k1
 ; SKX-NEXT:    kunpckwd %k0, %k1, %k0
 ; SKX-NEXT:    movl $-17, %ecx
 ; SKX-NEXT:    kmovd %ecx, %k1
-; SKX-NEXT:    setb %al
 ; SKX-NEXT:    kandd %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftld $31, %k1, %k1
@@ -889,10 +889,10 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    movw $-5, %cx
 ; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; KNL-NEXT:    setb %al
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL-NEXT:    kshiftrw $13, %k1, %k1
@@ -905,10 +905,10 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; SKX-LABEL: test_iinsertelement_v4i1:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    cmpl %esi, %edi
+; SKX-NEXT:    setb %al
 ; SKX-NEXT:    movb $-5, %cl
 ; SKX-NEXT:    kmovd %ecx, %k1
 ; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; SKX-NEXT:    setb %al
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $5, %k1, %k1
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 373bc00d004bd6..9e689341f7b88e 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1266,10 +1266,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kmovw %edi, %k3
 ; KNL-NEXT:    cmpl %edx, %esi
+; KNL-NEXT:    setg %al
 ; KNL-NEXT:    movw $-33, %cx
 ; KNL-NEXT:    kmovw %ecx, %k4
 ; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    setg %al
 ; KNL-NEXT:    kmovw %eax, %k4
 ; KNL-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL-NEXT:    kshiftrw $10, %k4, %k4
@@ -1291,10 +1291,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovq %rdi, %k0
 ; SKX-NEXT:    cmpl %edx, %esi
+; SKX-NEXT:    setg %al
 ; SKX-NEXT:    movq $-33, %rcx
 ; SKX-NEXT:    kmovq %rcx, %k1
 ; SKX-NEXT:    kandq %k1, %k0, %k0
-; SKX-NEXT:    setg %al
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftlq $63, %k1, %k1
 ; SKX-NEXT:    kshiftrq $58, %k1, %k1
@@ -1306,10 +1306,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    kmovq %rdi, %k0
 ; AVX512BW-NEXT:    cmpl %edx, %esi
+; AVX512BW-NEXT:    setg %al
 ; AVX512BW-NEXT:    movq $-33, %rcx
 ; AVX512BW-NEXT:    kmovq %rcx, %k1
 ; AVX512BW-NEXT:    kandq %k1, %k0, %k0
-; AVX512BW-NEXT:    setg %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512BW-NEXT:    kshiftrq $58, %k1, %k1
@@ -1329,10 +1329,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; AVX512DQ-NEXT:    kmovw %eax, %k2
 ; AVX512DQ-NEXT:    kmovw %edi, %k3
 ; AVX512DQ-NEXT:    cmpl %edx, %esi
+; AVX512DQ-NEXT:    setg %al
 ; AVX512DQ-NEXT:    movw $-33, %cx
 ; AVX512DQ-NEXT:    kmovw %ecx, %k4
 ; AVX512DQ-NEXT:    kandw %k4, %k1, %k1
-; AVX512DQ-NEXT:    setg %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k4
 ; AVX512DQ-NEXT:    kshiftlw $15, %k4, %k4
 ; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k4
@@ -1355,11 +1355,11 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setg %al
 ; X86-NEXT:    kshiftrq $6, %k0, %k1
 ; X86-NEXT:    kshiftlq $6, %k1, %k1
 ; X86-NEXT:    kshiftlq $59, %k0, %k0
 ; X86-NEXT:    kshiftrq $59, %k0, %k0
-; X86-NEXT:    setg %al
 ; X86-NEXT:    korq %k1, %k0, %k0
 ; X86-NEXT:    kmovd %eax, %k1
 ; X86-NEXT:    kshiftlq $63, %k1, %k1
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 9327ee800af19b..727e9ccfe27d5f 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1959,9 +1959,9 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin
 ; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
 ; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
+; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
-; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
 ; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
 ; X86-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
@@ -2134,9 +2134,9 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
 ; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
 ; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
+; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
-; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
 ; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
 ; X86-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index 21c26f3cd78ba8..c0bb0037923dce 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -10720,9 +10720,9 @@ declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8)
 define i8 at test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestm_d_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10779,9 +10779,9 @@ declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8)
 define i8 at test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestm_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10810,9 +10810,9 @@ declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8)
 define i8 at test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestm_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10843,9 +10843,9 @@ declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2)
 define i8 at test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestnm_d_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10902,9 +10902,9 @@ declare i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2)
 define i8 at test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestnm_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10933,9 +10933,9 @@ declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2)
 define i8 at test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestnm_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vptestnmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vptestnmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 386b63d551988b..edf12d219c452e 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -184,10 +184,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    xorl %eax, %esi
 ; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %ecx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 8a1d247543cf96..c8093c820e6a36 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -201,21 +201,19 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bsrl %eax, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    bsrl %ebp, %ebp
-; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    xorl $31, %ebp
 ; X86-NEXT:    orl $32, %ebp
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    orl $64, %ebp
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmovnel %ecx, %ebp
 ; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    movl %esi, %ebx
diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index c10a1ae9b8fb78..8767ea8a81beb3 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -232,10 +232,9 @@ define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -302,10 +301,9 @@ define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -439,10 +437,9 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %esi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    decl %esi
 ; X86-NOBMI-NEXT:    andl %edi, %esi
@@ -2344,10 +2341,9 @@ define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -2414,10 +2410,9 @@ define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -2551,10 +2546,9 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    notl %esi
 ; X86-NOBMI-NEXT:    andl %edi, %esi
diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
index ed09823b2b515f..5ed5c4515b3035 100644
--- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
@@ -80,12 +80,12 @@ define <2 x i256> @test_srl(<2 x i256> %In) nounwind {
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    shldl $28, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    shldl $28, %ebx, %edx
 ; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    shldl $28, %ecx, %ebx
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    shldl $28, %edi, %esi
@@ -162,12 +162,12 @@ define <2 x i256> @test_sra(<2 x i256> %In) nounwind {
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    shldl $26, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    shldl $26, %ebx, %edx
 ; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    shldl $26, %ecx, %ebx
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    shldl $26, %edi, %esi
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 1f4aa669a67e5a..c44aa8e61db47a 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -1263,14 +1263,15 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun
 ; SSE-NEXT:    movapd %xmm4, %xmm14
 ; SSE-NEXT:    mulpd %xmm13, %xmm14
 ; SSE-NEXT:    addpd %xmm10, %xmm14
+; SSE-NEXT:    movapd %xmm6, %xmm4
 ; SSE-NEXT:    mulpd %xmm6, %xmm13
 ; SSE-NEXT:    addpd %xmm15, %xmm13
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
-; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT:    mulpd %xmm6, %xmm8
 ; SSE-NEXT:    movapd %xmm7, %xmm10
 ; SSE-NEXT:    mulpd %xmm8, %xmm10
 ; SSE-NEXT:    addpd %xmm13, %xmm10
+; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT:    mulpd %xmm6, %xmm8
 ; SSE-NEXT:    addpd %xmm14, %xmm8
 ; SSE-NEXT:    movapd %xmm12, %xmm13
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm12[0]
@@ -1288,8 +1289,8 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun
 ; SSE-NEXT:    movapd %xmm5, %xmm14
 ; SSE-NEXT:    mulpd %xmm13, %xmm14
 ; SSE-NEXT:    addpd %xmm12, %xmm14
-; SSE-NEXT:    mulpd %xmm6, %xmm13
-; SSE-NEXT:    movapd %xmm6, %xmm2
+; SSE-NEXT:    mulpd %xmm4, %xmm13
+; SSE-NEXT:    movapd %xmm4, %xmm2
 ; SSE-NEXT:    addpd %xmm15, %xmm13
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1,1]
 ; SSE-NEXT:    movapd %xmm7, %xmm12
@@ -1653,29 +1654,28 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1]
 ; SSE-NEXT:    movaps %xmm3, %xmm10
 ; SSE-NEXT:    movaps %xmm3, %xmm12
-; SSE-NEXT:    addps %xmm15, %xmm0
 ; SSE-NEXT:    mulps %xmm0, %xmm10
-; SSE-NEXT:    addps %xmm5, %xmm10
 ; SSE-NEXT:    mulps %xmm2, %xmm0
+; SSE-NEXT:    addps %xmm15, %xmm0
 ; SSE-NEXT:    movaps %xmm14, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[2,2]
 ; SSE-NEXT:    movaps %xmm4, %xmm2
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT:    movaps %xmm4, %xmm15
 ; SSE-NEXT:    mulps %xmm1, %xmm2
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
 ; SSE-NEXT:    addps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm11, %xmm1
-; SSE-NEXT:    addps %xmm10, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,3,3,3]
 ; SSE-NEXT:    movaps %xmm7, %xmm3
 ; SSE-NEXT:    mulps %xmm14, %xmm3
-; SSE-NEXT:    addps %xmm1, %xmm3
 ; SSE-NEXT:    mulps %xmm6, %xmm14
 ; SSE-NEXT:    addps %xmm2, %xmm14
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    addps %xmm5, %xmm10
+; SSE-NEXT:    movaps %xmm4, %xmm15
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    mulps %xmm11, %xmm1
+; SSE-NEXT:    addps %xmm10, %xmm1
+; SSE-NEXT:    addps %xmm1, %xmm3
 ; SSE-NEXT:    movaps %xmm5, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm5[0,0]
-; SSE-NEXT:    movaps %xmm13, %xmm2
 ; SSE-NEXT:    mulps %xmm1, %xmm2
 ; SSE-NEXT:    addps %xmm14, %xmm2
 ; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
@@ -1913,9 +1913,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT:    movaps %xmm4, %xmm7
 ; SSE-NEXT:    mulps %xmm0, %xmm7
 ; SSE-NEXT:    addps %xmm1, %xmm7
-; SSE-NEXT:    movaps %xmm12, %xmm15
-; SSE-NEXT:    mulps %xmm6, %xmm0
 ; SSE-NEXT:    movaps %xmm6, %xmm3
+; SSE-NEXT:    mulps %xmm6, %xmm0
 ; SSE-NEXT:    addps %xmm2, %xmm0
 ; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT:    movaps %xmm4, %xmm1
@@ -1956,6 +1955,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT:    mulps %xmm1, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm14
 ; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1]
+; SSE-NEXT:    movaps %xmm12, %xmm15
 ; SSE-NEXT:    movaps %xmm12, %xmm13
 ; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    mulps %xmm14, %xmm15
@@ -2797,9 +2797,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; AVX512F-NEXT:    vmulps %ymm1, %ymm10, %ymm1
 ; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vextractf32x4 $3, %zmm7, %xmm1
-; AVX512F-NEXT:    vmulps %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
 ; AVX512F-NEXT:    vbroadcastss %xmm1, %ymm1
+; AVX512F-NEXT:    vmulps %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm1 = ymm13[1,1,3,3,5,5,7,7]
 ; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
@@ -2813,9 +2812,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
 ; AVX512F-NEXT:    vmulps %ymm1, %ymm8, %ymm1
 ; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vinsertf64x4 $1, %ymm6, %zmm14, %zmm2
 ; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm12, %zmm3
-; AVX512F-NEXT:    vmovaps %zmm4, %zmm0
+; AVX512F-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    vinsertf64x4 $1, %ymm6, %zmm14, %zmm2
 ; AVX512F-NEXT:    vmovaps %zmm5, %zmm1
 ; AVX512F-NEXT:    retq
 ;
@@ -3528,16 +3527,16 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    movapd %xmm15, %xmm4
 ; SSE-NEXT:    mulpd %xmm1, %xmm4
 ; SSE-NEXT:    addpd %xmm3, %xmm4
-; SSE-NEXT:    movapd %xmm10, %xmm5
-; SSE-NEXT:    movapd %xmm13, %xmm3
-; SSE-NEXT:    mulpd %xmm1, %xmm5
-; SSE-NEXT:    addpd %xmm5, %xmm4
 ; SSE-NEXT:    movapd %xmm13, %xmm8
+; SSE-NEXT:    movapd %xmm13, %xmm3
 ; SSE-NEXT:    mulpd %xmm0, %xmm3
+; SSE-NEXT:    movapd %xmm10, %xmm5
 ; SSE-NEXT:    movapd %xmm10, %xmm15
+; SSE-NEXT:    mulpd %xmm1, %xmm5
 ; SSE-NEXT:    addpd %xmm3, %xmm5
 ; SSE-NEXT:    movapd %xmm12, %xmm10
 ; SSE-NEXT:    mulpd %xmm12, %xmm0
+; SSE-NEXT:    movapd %xmm14, %xmm9
 ; SSE-NEXT:    mulpd %xmm14, %xmm1
 ; SSE-NEXT:    addpd %xmm0, %xmm1
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
@@ -3587,24 +3586,24 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    addpd %xmm6, %xmm0
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT:    mulpd %xmm1, %xmm4
+; SSE-NEXT:    addpd %xmm5, %xmm4
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT:    mulpd %xmm1, %xmm5
 ; SSE-NEXT:    addpd %xmm7, %xmm5
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
 ; SSE-NEXT:    mulpd %xmm2, %xmm1
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0]
-; SSE-NEXT:    movapd %xmm14, %xmm9
 ; SSE-NEXT:    addpd %xmm3, %xmm1
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
 ; SSE-NEXT:    movapd %xmm7, %xmm3
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT:    mulpd %xmm7, %xmm2
+; SSE-NEXT:    mulpd %xmm3, %xmm2
 ; SSE-NEXT:    addpd %xmm1, %xmm2
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    mulpd %xmm7, %xmm1
+; SSE-NEXT:    mulpd %xmm3, %xmm1
 ; SSE-NEXT:    addpd %xmm5, %xmm1
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT:    mulpd %xmm7, %xmm5
+; SSE-NEXT:    mulpd %xmm3, %xmm5
 ; SSE-NEXT:    addpd %xmm4, %xmm5
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT:    mulpd %xmm4, %xmm3
@@ -3749,6 +3748,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    mulpd %xmm1, %xmm2
 ; SSE-NEXT:    addpd %xmm3, %xmm2
 ; SSE-NEXT:    mulpd %xmm0, %xmm11
+; SSE-NEXT:    movapd %xmm13, %xmm6
 ; SSE-NEXT:    movapd %xmm13, %xmm4
 ; SSE-NEXT:    mulpd %xmm1, %xmm4
 ; SSE-NEXT:    addpd %xmm11, %xmm4
@@ -3792,7 +3792,6 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0]
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
 ; SSE-NEXT:    mulpd %xmm9, %xmm3
-; SSE-NEXT:    movapd %xmm13, %xmm6
 ; SSE-NEXT:    addpd %xmm0, %xmm3
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
 ; SSE-NEXT:    mulpd %xmm9, %xmm10
@@ -3854,20 +3853,20 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT:    mulpd %xmm1, %xmm2
 ; SSE-NEXT:    addpd %xmm3, %xmm2
 ; SSE-NEXT:    movapd %xmm6, %xmm7
-; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT:    movapd %xmm14, %xmm3
 ; SSE-NEXT:    mulpd %xmm1, %xmm7
 ; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT:    movapd %xmm6, %xmm9
+; SSE-NEXT:    mulpd %xmm1, %xmm9
+; SSE-NEXT:    mulpd %xmm5, %xmm1
+; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT:    movapd %xmm14, %xmm3
 ; SSE-NEXT:    mulpd %xmm0, %xmm3
+; SSE-NEXT:    addpd %xmm3, %xmm7
+; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; SSE-NEXT:    movapd %xmm4, %xmm3
-; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT:    addpd %xmm4, %xmm7
 ; SSE-NEXT:    mulpd %xmm0, %xmm3
-; SSE-NEXT:    movapd %xmm6, %xmm9
-; SSE-NEXT:    mulpd %xmm1, %xmm9
 ; SSE-NEXT:    addpd %xmm3, %xmm9
 ; SSE-NEXT:    mulpd %xmm8, %xmm0
-; SSE-NEXT:    mulpd %xmm5, %xmm1
 ; SSE-NEXT:    addpd %xmm0, %xmm1
 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT:    movapd %xmm0, %xmm10
diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index 65bd1df5821042..80a1a2a0554cc2 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -305,13 +305,13 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq 8(%rdi), %r11
 ; X64-NEXT:    movq 16(%rdi), %r10
 ; X64-NEXT:    movq (%rsi), %r9
-; X64-NEXT:    movq 8(%rsi), %r14
 ; X64-NEXT:    movq 24(%rdi), %r15
-; X64-NEXT:    imulq %r9, %r15
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq 16(%rsi), %r8
 ; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq 16(%rsi), %r8
+; X64-NEXT:    movq 8(%rsi), %r14
+; X64-NEXT:    imulq %r9, %r15
 ; X64-NEXT:    imulq %r14, %r10
 ; X64-NEXT:    addq %rdx, %r10
 ; X64-NEXT:    addq %r15, %r10
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 355f770fd17044..64f6746e616ede 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -1218,11 +1218,11 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %rcx, %r9
 ; X64-NEXT:    adcq %rsi, %rdx
 ; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rbx, %rsi
 ; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rbx, %rsi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r8
diff --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index d7b8734e96e396..4d81193e1b2e58 100644
--- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -273,7 +273,6 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64
 ; CHECK32:       # %bb.0: # %entry
 ; CHECK32-NEXT:    pushl %ebp
 ; CHECK32-NEXT:    pushl %ebx
-; CHECK32-NEXT:    pushl %edi
 ; CHECK32-NEXT:    pushl %esi
 ; CHECK32-NEXT:    pushl %eax
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
@@ -282,12 +281,11 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK32-NEXT:    lock cmpxchg8b (%esi)
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movl %ebp, %edx
-; CHECK32-NEXT:    movl %edi, %ecx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK32-NEXT:    lock cmpxchg8b (%esi)
@@ -304,7 +302,6 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64
 ; CHECK32-NEXT:    xorl %edx, %edx
 ; CHECK32-NEXT:    addl $4, %esp
 ; CHECK32-NEXT:    popl %esi
-; CHECK32-NEXT:    popl %edi
 ; CHECK32-NEXT:    popl %ebx
 ; CHECK32-NEXT:    popl %ebp
 ; CHECK32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index b60b36744f038b..f2b990f0bad62e 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -1114,9 +1114,8 @@ define <32 x i32> @zext_mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: zext_mulhuw_v32i16_lshr:
@@ -1209,9 +1208,8 @@ define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mulhsw_v32i16_lshr:
@@ -1310,9 +1308,8 @@ define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mulhsw_v32i16_ashr:
@@ -1484,10 +1481,9 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vpmulhuw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT:    retq
 ;
@@ -1665,10 +1661,9 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vpmulhw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT:    retq
 ;
@@ -1847,10 +1842,9 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
 ; AVX512F-NEXT:    vpmulhw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll
index ea72b64ee00c01..29922c2ac1a716 100644
--- a/llvm/test/CodeGen/X86/pr34177.ll
+++ b/llvm/test/CodeGen/X86/pr34177.ll
@@ -49,13 +49,13 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
 ; AVX512VL-LABEL: test:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; AVX512VL-NEXT:    kshiftrb $2, %k0, %k1
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    testb $2, %al
 ; AVX512VL-NEXT:    fld1
 ; AVX512VL-NEXT:    fldz
 ; AVX512VL-NEXT:    fld %st(0)
 ; AVX512VL-NEXT:    fcmovne %st(2), %st
-; AVX512VL-NEXT:    kshiftrb $2, %k0, %k1
 ; AVX512VL-NEXT:    testb $1, %al
 ; AVX512VL-NEXT:    fld %st(1)
 ; AVX512VL-NEXT:    fcmovne %st(3), %st
diff --git a/llvm/test/CodeGen/X86/pr61964.ll b/llvm/test/CodeGen/X86/pr61964.ll
index 4fea9c8cffec6f..afb4a51769d619 100644
--- a/llvm/test/CodeGen/X86/pr61964.ll
+++ b/llvm/test/CodeGen/X86/pr61964.ll
@@ -22,9 +22,8 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
-; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splitTransposeDecode_8_avx2:
@@ -58,9 +57,8 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16
 ; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
-; XOPAVX1-NEXT:    vmovaps %ymm2, %ymm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splitTransposeDecode_8_avx2:
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 779999816ebbf2..36866cf47aa25c 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -428,7 +428,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; i686-NEXT:    sarl $31, %eax
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -450,9 +449,9 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    shll %cl, %edx
 ; i686-NEXT:    orl %eax, %edx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebp, %eax
-; i686-NEXT:    movl %ebp, %edx
+; i686-NEXT:    movl %eax, %edx
 ; i686-NEXT:    andl $7, %edx
 ; i686-NEXT:    shrl $3, %eax
 ; i686-NEXT:    andl $15, %eax
@@ -551,23 +550,23 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    subl $100, %esp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebp, %ecx
+; i686-NEXT:    movl %ecx, %ebp
 ; i686-NEXT:    shrl $3, %ebp
 ; i686-NEXT:    andl $15, %ebp
 ; i686-NEXT:    leal {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index 0771beecda770c..2a748b60524215 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -166,17 +166,15 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    shldl $30, %eax, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    shldl $30, %eax, %ebx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    shldl $30, %eax, %ebp
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 14db7ac90ef570..177543aff1e8ae 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -265,7 +265,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
@@ -279,12 +278,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    cmovll %esi, %ecx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrdl $2, %edx, %edi
 ; X86-NEXT:    cmpl $2, %edx
 ; X86-NEXT:    cmovgel %ebp, %edi
 ; X86-NEXT:    cmpl $-2, %edx
 ; X86-NEXT:    cmovll %esi, %edi
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    shrdl $2, %edx, %ebx
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 8a5d3bb0936775..b4f12123424c61 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -91,12 +91,11 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
@@ -585,12 +584,11 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
@@ -1293,9 +1291,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index 554548fa8f4c33..60b9b62e94ab2c 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -1661,43 +1661,41 @@ define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movzbl 11(%rdi), %eax
 ; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT:    movzbl 6(%rdi), %r10d
 ; SCALAR-NEXT:    movzbl 10(%rdi), %ebp
-; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT:    movzbl 6(%rdi), %r10d
 ; SCALAR-NEXT:    notb %r10b
-; SCALAR-NEXT:    movzbl 9(%rdi), %r14d
+; SCALAR-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT:    movzbl 9(%rdi), %r10d
 ; SCALAR-NEXT:    movzbl 8(%rdi), %eax
 ; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movzbl 7(%rdi), %r12d
 ; SCALAR-NEXT:    movzbl 5(%rdi), %r9d
 ; SCALAR-NEXT:    movzbl 4(%rdi), %ebx
-; SCALAR-NEXT:    movzbl 12(%rdi), %r15d
 ; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
 ; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
 ; SCALAR-NEXT:    movzbl (%rdi), %eax
 ; SCALAR-NEXT:    movzbl 1(%rdi), %r13d
+; SCALAR-NEXT:    movzbl 12(%rdi), %edi
 ; SCALAR-NEXT:    notb %al
 ; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r13b
 ; SCALAR-NEXT:    notb %cl
 ; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r8b
+; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %bl
 ; SCALAR-NEXT:    notb %r9b
 ; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r12b
 ; SCALAR-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
 ; SCALAR-NEXT:    notb %r11b
-; SCALAR-NEXT:    movl %r14d, %r10d
 ; SCALAR-NEXT:    notb %r10b
 ; SCALAR-NEXT:    notb %bpl
 ; SCALAR-NEXT:    movl %ebp, %r14d
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
 ; SCALAR-NEXT:    notb %r8b
 ; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT:    movl %r15d, %edi
 ; SCALAR-NEXT:    notb %dil
 ; SCALAR-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
@@ -4757,9 +4755,9 @@ define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movzbl 7(%rdi), %ebx
 ; SCALAR-NEXT:    movzbl 6(%rdi), %r10d
 ; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
-; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl 5(%rdi), %r15d
+; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT:    movzbl 5(%rdi), %r9d
 ; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
 ; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
 ; SCALAR-NEXT:    movzbl (%rdi), %eax
@@ -4771,8 +4769,7 @@ define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    notb %cl
 ; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT:    movl %r15d, %r9d
+; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r9b
 ; SCALAR-NEXT:    notb %r10b
 ; SCALAR-NEXT:    notb %bl
@@ -6695,9 +6692,9 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movzbl 7(%rdi), %ebp
 ; SCALAR-NEXT:    movzbl 6(%rdi), %r11d
 ; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
-; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl 5(%rdi), %ebx
+; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT:    movzbl 5(%rdi), %r9d
 ; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
 ; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
 ; SCALAR-NEXT:    movzbl (%rdi), %eax
@@ -6709,8 +6706,7 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    notb %cl
 ; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT:    movl %ebx, %r9d
+; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    notb %r9b
 ; SCALAR-NEXT:    notb %r11b
 ; SCALAR-NEXT:    movl %r11d, %ebx
@@ -6762,13 +6758,14 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movb %r8b, 14(%rdx)
 ; SCALAR-NEXT:    movb %al, 13(%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT:    movb %r8b, 4(%rdx)
-; SCALAR-NEXT:    movb %al, 12(%rdx)
+; SCALAR-NEXT:    movb %sil, 9(%rdx)
+; SCALAR-NEXT:    movl %r10d, %esi
+; SCALAR-NEXT:    movb %r10b, (%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SCALAR-NEXT:    movb %al, 12(%rdx)
 ; SCALAR-NEXT:    movb %r13b, 11(%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
 ; SCALAR-NEXT:    movb %r15b, 10(%rdx)
-; SCALAR-NEXT:    movb %sil, 9(%rdx)
 ; SCALAR-NEXT:    movb %r12b, 8(%rdx)
 ; SCALAR-NEXT:    movb %r14b, 7(%rdx)
 ; SCALAR-NEXT:    movb %bl, 6(%rdx)
@@ -6776,13 +6773,11 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movl %r9d, %r11d
 ; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SCALAR-NEXT:    movb %r8b, 4(%rdx)
 ; SCALAR-NEXT:    movb %bpl, 3(%rdx)
 ; SCALAR-NEXT:    movb %dil, 2(%rdx)
 ; SCALAR-NEXT:    movb %cl, 1(%rdx)
 ; SCALAR-NEXT:    movl %ecx, %r14d
-; SCALAR-NEXT:    movb %r10b, (%rdx)
-; SCALAR-NEXT:    movb %al, 22(%rdx)
-; SCALAR-NEXT:    movl %r10d, %esi
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; SCALAR-NEXT:    movb %cl, 31(%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
@@ -6800,6 +6795,7 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
 ; SCALAR-NEXT:    movb %bl, 23(%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SCALAR-NEXT:    movb %al, 22(%rdx)
 ; SCALAR-NEXT:    movb %r11b, 21(%rdx)
 ; SCALAR-NEXT:    movb %r8b, 20(%rdx)
 ; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index f357e57b305991..e255855d382327 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -122,17 +122,15 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    shldl $30, %eax, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    shldl $30, %eax, %ebx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    shldl $30, %eax, %ebp
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index f40276822b3fa7..4c54e06786cccc 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -194,7 +194,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %esi
@@ -205,10 +204,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    cmovael %ecx, %esi
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrdl $2, %edx, %ebx
 ; X86-NEXT:    cmpl $4, %edx
 ; X86-NEXT:    cmovael %ecx, %ebx
-; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    shrdl $2, %edx, %ebp
@@ -390,7 +389,6 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %esi
@@ -399,8 +397,8 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    cmovol %edi, %esi
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmovol %edi, %ebx
-; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    cmovol %edi, %ebp
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 00898ab313a3cc..b7fe4f0062a40e 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -526,10 +526,10 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm7, %xmm5
 ; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm1, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm0
 ; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, 16(%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: umulo_v6i32:
@@ -544,11 +544,10 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovq %xmm1, 16(%rdi)
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm0
+; AVX2-NEXT:    vmovq %xmm1, 16(%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: umulo_v6i32:
@@ -697,10 +696,10 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm7, %xmm5
 ; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm0
 ; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: umulo_v8i32:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index e71aa794640f75..72c9fcceb6933d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -947,16 +947,16 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
 ; SSE-NEXT:    movdqa (%rdi), %xmm10
 ; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm13
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pslld $16, %xmm0
 ; SSE-NEXT:    psrad $16, %xmm0
 ; SSE-NEXT:    pslld $16, %xmm13
-; SSE-NEXT:    movdqa 16(%rdi), %xmm7
 ; SSE-NEXT:    psrad $16, %xmm13
 ; SSE-NEXT:    packssdw %xmm0, %xmm13
+; SSE-NEXT:    movdqa 16(%rdi), %xmm7
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm7, %xmm0
 ; SSE-NEXT:    pslld $16, %xmm0
 ; SSE-NEXT:    psrad $16, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index 26f2f17ad24046..e3d1d668b9d43b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -1233,8 +1233,6 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $40, %rsp
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm6
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm13
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm11
 ; SSE-NEXT:    movdqa (%rdi), %xmm15
@@ -1244,59 +1242,60 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm0
 ; SSE-NEXT:    movdqa 64(%rdi), %xmm12
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT:    pand %xmm1, %xmm3
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    pandn %xmm12, %xmm2
 ; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pand %xmm1, %xmm3
 ; SSE-NEXT:    por %xmm2, %xmm3
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
-; SSE-NEXT:    movdqa 160(%rdi), %xmm9
-; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,7,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1]
-; SSE-NEXT:    movdqa %xmm11, %xmm8
-; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    pandn %xmm10, %xmm2
 ; SSE-NEXT:    movdqa %xmm15, %xmm3
 ; SSE-NEXT:    pand %xmm1, %xmm3
 ; SSE-NEXT:    por %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
+; SSE-NEXT:    movdqa 160(%rdi), %xmm7
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
-; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm10, %xmm11
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
-; SSE-NEXT:    movdqa 96(%rdi), %xmm5
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
 ; SSE-NEXT:    movaps %xmm4, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm9, %xmm7
-; SSE-NEXT:    pandn %xmm9, %xmm2
+; SSE-NEXT:    pandn %xmm7, %xmm2
 ; SSE-NEXT:    movdqa %xmm13, %xmm3
 ; SSE-NEXT:    pand %xmm1, %xmm3
 ; SSE-NEXT:    por %xmm2, %xmm3
-; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,7,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0]
-; SSE-NEXT:    movdqa 112(%rdi), %xmm6
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 96(%rdi), %xmm5
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm11, %xmm8
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm10, %xmm11
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 112(%rdi), %xmm6
 ; SSE-NEXT:    movdqa %xmm5, %xmm2
 ; SSE-NEXT:    pand %xmm1, %xmm2
 ; SSE-NEXT:    pandn %xmm6, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index 0f9f83bafdf93f..7a6329816a9563 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1880,10 +1880,10 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm10
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpackusdw %xmm7, %xmm8, %xmm7
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 22262b414df1a2..22b987544e1c6d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -2416,27 +2416,27 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm12
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm3
-; SSE-NEXT:    movaps 192(%rdi), %xmm2
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 192(%rdi), %xmm2
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    movdqa 48(%rdi), %xmm5
 ; SSE-NEXT:    pandn %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
 ; SSE-NEXT:    pand %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
-; SSE-NEXT:    movdqa %xmm11, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
+; SSE-NEXT:    movdqa 48(%rdi), %xmm5
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm11, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
 ; SSE-NEXT:    movaps {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,65535,65535,0]
 ; SSE-NEXT:    andps %xmm15, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
 ; SSE-NEXT:    movaps %xmm15, %xmm2
 ; SSE-NEXT:    pandn %xmm1, %xmm2
 ; SSE-NEXT:    por %xmm3, %xmm2
@@ -3675,13 +3675,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm14
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm7
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm6
 ; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3]
 ; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm8
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
@@ -4888,35 +4888,35 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 464(%rdi), %xmm5
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm6
-; SSE-NEXT:    movdqa 416(%rdi), %xmm11
-; SSE-NEXT:    movdqa 448(%rdi), %xmm4
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 432(%rdi), %xmm7
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm15
 ; SSE-NEXT:    movdqa 96(%rdi), %xmm10
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm14
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    movdqa 400(%rdi), %xmm8
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
+; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    pandn %xmm2, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3]
-; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm0, %xmm2
 ; SSE-NEXT:    por %xmm1, %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1]
+; SSE-NEXT:    movdqa 400(%rdi), %xmm8
+; SSE-NEXT:    movdqa 416(%rdi), %xmm11
+; SSE-NEXT:    movdqa 448(%rdi), %xmm4
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 432(%rdi), %xmm7
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
 ; SSE-NEXT:    movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
 ; SSE-NEXT:    andps %xmm13, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1]
 ; SSE-NEXT:    movaps %xmm13, %xmm2
 ; SSE-NEXT:    pandn %xmm1, %xmm2
 ; SSE-NEXT:    por %xmm3, %xmm2
@@ -6561,7 +6561,6 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    subq $1048, %rsp # imm = 0x418
 ; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm10
 ; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm7
-; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm14
 ; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm8
 ; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
@@ -6573,7 +6572,6 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm4
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
@@ -6587,6 +6585,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
+; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm4
+; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm14
 ; AVX2-NEXT:    vmovdqa %ymm7, %ymm11
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index a53ca1ccaa668c..e55f8be61bc53e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -1580,22 +1580,18 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $136, %rsp
 ; SSE-NEXT:    movdqa 64(%rdi), %xmm2
-; SSE-NEXT:    movdqa 128(%rdi), %xmm7
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm11
 ; SSE-NEXT:    movdqa (%rdi), %xmm3
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm6
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 48(%rdi), %xmm8
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
 ; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT:    movdqa 112(%rdi), %xmm9
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
 ; SSE-NEXT:    pand %xmm10, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
@@ -1605,16 +1601,22 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; SSE-NEXT:    movdqa %xmm11, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0]
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[0,0]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[2,3]
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pslld $16, %xmm11
 ; SSE-NEXT:    psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; SSE-NEXT:    movdqa 48(%rdi), %xmm8
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[0,3,2,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3]
+; SSE-NEXT:    movdqa 112(%rdi), %xmm11
+; SSE-NEXT:    movdqa 128(%rdi), %xmm7
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0]
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1625,10 +1627,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm9, %xmm11
-; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3]
 ; SSE-NEXT:    pand %xmm10, %xmm2
 ; SSE-NEXT:    por %xmm3, %xmm2
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm14
@@ -1664,7 +1664,6 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    pandn %xmm15, %xmm13
 ; SSE-NEXT:    pand %xmm10, %xmm9
 ; SSE-NEXT:    por %xmm13, %xmm9
-; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0]
 ; SSE-NEXT:    movdqa %xmm11, %xmm4
 ; SSE-NEXT:    psrld $16, %xmm4
@@ -4165,15 +4164,15 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm3
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
+; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
 ; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
 ; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
@@ -4183,9 +4182,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm9, %ymm4, %ymm4
 ; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm11, %ymm5
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
 ; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm9
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index ec9f87b201a958..312fa596cd1fe2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -2028,24 +2028,25 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps 160(%rdi), %xmm5
 ; SSE-NEXT:    movaps %xmm5, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm13
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm15
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm14
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
-; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
+; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    pandn %xmm0, %xmm2
 ; SSE-NEXT:    movdqa %xmm15, %xmm0
-; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movaps 144(%rdi), %xmm7
 ; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
 ; SSE-NEXT:    movdqa %xmm3, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm10
 ; SSE-NEXT:    pandn %xmm0, %xmm2
+; SSE-NEXT:    movaps 144(%rdi), %xmm7
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
+; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm3, %xmm10
 ; SSE-NEXT:    movaps %xmm7, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2]
 ; SSE-NEXT:    movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535]
@@ -4151,9 +4152,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 160(%rdi), %xmm7
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm9
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm12
-; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm1, %xmm11
@@ -4161,14 +4160,16 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pandn %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE-NEXT:    movaps 144(%rdi), %xmm10
 ; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0]
 ; SSE-NEXT:    movdqa %xmm15, %xmm1
 ; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movaps 144(%rdi), %xmm10
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm10, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2]
 ; SSE-NEXT:    movaps %xmm7, %xmm10
@@ -7702,7 +7703,6 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm1[0,1,0,1]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7,8,9,10],ymm15[11],ymm3[12,13,14,15]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm20, %zmm25, %zmm11
 ; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm12
@@ -7714,6 +7714,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
 ; AVX512DQ-NEXT:    vpor %ymm3, %ymm12, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm20, %zmm25, %zmm11
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1}
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
@@ -8632,12 +8633,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm11
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
-; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 176(%rdi), %xmm4
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
 ; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa 176(%rdi), %xmm4
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm4, %xmm0
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT:    pand %xmm3, %xmm0
@@ -13252,22 +13253,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $1544, %rsp # imm = 0x608
 ; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm14
-; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm13
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, %ymm11
-; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm1, %ymm11
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
 ; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7]
@@ -13278,12 +13272,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm2
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm2, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
@@ -14108,7 +14109,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm19
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX512-NEXT:    vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm9
 ; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm11
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm21
@@ -14174,6 +14176,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 640(%rdi), %ymm16
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512-NEXT:    vmovdqa 688(%rdi), %xmm3
@@ -14191,8 +14194,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
 ; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm14
 ; AVX512-NEXT:    vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm1
-; AVX512-NEXT:    vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm7
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
@@ -14479,6 +14480,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6],xmm12[7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-NEXT:    vmovdqa %ymm1, %ymm15
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
 ; AVX512-NEXT:    vextracti32x4 $1, %ymm12, %xmm25
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3]
@@ -14486,7 +14488,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512-NEXT:    vmovdqa %ymm1, %ymm15
 ; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index ff1e9cf28f2ea8..7765b40fe4f294 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -3459,27 +3459,27 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 240(%rdi), %xmm5
-; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 224(%rdi), %xmm15
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm7
-; SSE-NEXT:    movdqa 144(%rdi), %xmm6
-; SSE-NEXT:    movdqa 128(%rdi), %xmm4
 ; SSE-NEXT:    movdqa %xmm7, (%rsp) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm0
-; SSE-NEXT:    movdqa 192(%rdi), %xmm3
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
 ; SSE-NEXT:    movdqa %xmm0, %xmm7
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
 ; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0]
+; SSE-NEXT:    movdqa 192(%rdi), %xmm3
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 144(%rdi), %xmm6
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 128(%rdi), %xmm4
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, %xmm15
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0]
@@ -4474,8 +4474,10 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa %xmm9, %xmm14
 ; AVX2-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -4498,8 +4500,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4935,8 +4935,10 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa %xmm9, %xmm14
 ; AVX2-FP-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-FP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -4959,8 +4961,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -5396,8 +5396,10 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa %xmm9, %xmm14
 ; AVX2-FCP-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-FCP-NEXT:    vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -5420,8 +5422,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6288,7 +6288,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
@@ -6298,9 +6297,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
 ; AVX512-FCP-NEXT:    vpermt2d %xmm20, %xmm27, %xmm9
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm0
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -7061,7 +7060,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
@@ -7071,9 +7069,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %xmm20, %xmm27, %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -12887,9 +12885,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
 ; AVX512-NEXT:    vmovdqa %xmm5, (%rsp) # 16-byte Spill
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpermt2d %xmm1, %xmm10, %xmm2
 ; AVX512-NEXT:    vmovdqa 560(%rdi), %xmm1
@@ -12975,15 +12974,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT:    vmovdqa %xmm8, %xmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
@@ -13056,6 +13052,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm8 = mem[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -13389,11 +13386,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
 ; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm6
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm5
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -13456,6 +13452,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm20
 ; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm10 = mem[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
 ; AVX512-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
@@ -13852,6 +13849,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vpermt2d %xmm17, %xmm13, %xmm9
@@ -13860,9 +13858,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm11
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
@@ -14727,9 +14724,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rsp) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpermt2d %xmm1, %xmm10, %xmm2
 ; AVX512DQ-NEXT:    vmovdqa 560(%rdi), %xmm1
@@ -14815,15 +14813,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa %xmm8, %xmm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
@@ -14896,6 +14891,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm8 = mem[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -15229,11 +15225,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm6
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm5
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -15296,6 +15291,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm20
 ; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm10 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
@@ -15692,6 +15688,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpermt2d %xmm17, %xmm13, %xmm9
@@ -15700,9 +15697,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm11
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm12
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 65e3ba8b8200b1..515ee69203edd7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -728,8 +728,6 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-LABEL: load_i32_stride3_vf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movaps 144(%rdi), %xmm11
-; SSE-NEXT:    movaps 128(%rdi), %xmm1
-; SSE-NEXT:    movaps 112(%rdi), %xmm13
 ; SSE-NEXT:    movaps 176(%rdi), %xmm10
 ; SSE-NEXT:    movaps 160(%rdi), %xmm9
 ; SSE-NEXT:    movaps (%rdi), %xmm7
@@ -738,26 +736,28 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 48(%rdi), %xmm15
 ; SSE-NEXT:    movaps 80(%rdi), %xmm14
+; SSE-NEXT:    movaps 64(%rdi), %xmm2
 ; SSE-NEXT:    movaps %xmm2, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0]
 ; SSE-NEXT:    movaps %xmm15, %xmm5
-; SSE-NEXT:    movaps 64(%rdi), %xmm2
 ; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm8, %xmm0
-; SSE-NEXT:    movaps %xmm7, %xmm5
-; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm8[0,2]
-; SSE-NEXT:    movaps 96(%rdi), %xmm6
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm7, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
 ; SSE-NEXT:    movaps %xmm9, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0]
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm11, %xmm3
-; SSE-NEXT:    movaps %xmm11, %xmm4
-; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2]
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 96(%rdi), %xmm6
+; SSE-NEXT:    movaps 128(%rdi), %xmm1
+; SSE-NEXT:    movaps 112(%rdi), %xmm13
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm11, %xmm4
+; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm13, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
 ; SSE-NEXT:    movaps %xmm1, %xmm12
@@ -1199,41 +1199,41 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $392, %rsp # imm = 0x188
 ; SSE-NEXT:    movaps 240(%rdi), %xmm7
-; SSE-NEXT:    movaps 224(%rdi), %xmm3
-; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 208(%rdi), %xmm14
 ; SSE-NEXT:    movaps 272(%rdi), %xmm6
 ; SSE-NEXT:    movaps 256(%rdi), %xmm9
 ; SSE-NEXT:    movaps 48(%rdi), %xmm2
-; SSE-NEXT:    movaps 32(%rdi), %xmm11
-; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 192(%rdi), %xmm4
+; SSE-NEXT:    movaps 80(%rdi), %xmm1
 ; SSE-NEXT:    movaps 64(%rdi), %xmm5
 ; SSE-NEXT:    movaps %xmm5, %xmm0
-; SSE-NEXT:    movaps 80(%rdi), %xmm1
-; SSE-NEXT:    movaps (%rdi), %xmm13
-; SSE-NEXT:    movaps %xmm1, %xmm12
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm1, %xmm12
 ; SSE-NEXT:    movaps %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm9, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,2]
-; SSE-NEXT:    movaps 16(%rdi), %xmm8
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm9, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0]
-; SSE-NEXT:    movaps %xmm6, %xmm10
-; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm7, %xmm1
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 16(%rdi), %xmm8
+; SSE-NEXT:    movaps 32(%rdi), %xmm11
+; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm8, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0]
+; SSE-NEXT:    movaps 192(%rdi), %xmm4
+; SSE-NEXT:    movaps 224(%rdi), %xmm3
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps (%rdi), %xmm13
 ; SSE-NEXT:    movaps %xmm13, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
+; SSE-NEXT:    movaps 208(%rdi), %xmm14
+; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm6, %xmm10
+; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm14, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
@@ -1763,13 +1763,13 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
 ; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm12, %ymm3
-; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7]
 ; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm12, %ymm3
 ; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm0
@@ -2146,21 +2146,21 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps 448(%rdi), %xmm11
 ; SSE-NEXT:    movaps %xmm11, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movaps 48(%rdi), %xmm9
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 272(%rdi), %xmm3
-; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 80(%rdi), %xmm1
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 64(%rdi), %xmm12
 ; SSE-NEXT:    movaps %xmm12, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
 ; SSE-NEXT:    movaps %xmm9, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 240(%rdi), %xmm7
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0]
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 272(%rdi), %xmm3
+; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 256(%rdi), %xmm13
+; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm13, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
 ; SSE-NEXT:    movaps %xmm7, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index aa23dcc824c72d..1cb51b42d9bc1c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -1035,25 +1035,25 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1]
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
 ; AVX-NEXT:    vmovaps %ymm3, %ymm14
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps %ymm1, %ymm15
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
 ; AVX-NEXT:    vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
 ; AVX-NEXT:    vmovaps %ymm2, %ymm10
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4]
 ; AVX-NEXT:    vmovaps %ymm1, %ymm3
 ; AVX-NEXT:    vmovaps 160(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 176(%rdi), %xmm6
 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm1[0]
 ; AVX-NEXT:    vmovaps %xmm6, %xmm2
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm4
-; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 144(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm6
-; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm12 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,0]
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm4
+; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4]
+; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm4[2,3,0,1]
@@ -1777,16 +1777,11 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps 304(%rdi), %xmm8
 ; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 336(%rdi), %xmm10
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 320(%rdi), %xmm6
-; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 368(%rdi), %xmm11
-; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 352(%rdi), %xmm5
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 80(%rdi), %xmm9
-; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 64(%rdi), %xmm1
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 112(%rdi), %xmm4
@@ -1798,17 +1793,22 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
 ; SSE-NEXT:    movaps %xmm4, %xmm1
 ; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm5, %xmm1
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 288(%rdi), %xmm2
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
 ; SSE-NEXT:    movaps %xmm6, %xmm5
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm5, %xmm0
 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 288(%rdi), %xmm2
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm2, %xmm0
@@ -3516,14 +3516,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps 176(%rdi), %xmm11
 ; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 208(%rdi), %xmm3
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 80(%rdi), %xmm10
 ; SSE-NEXT:    movaps 240(%rdi), %xmm6
-; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 80(%rdi), %xmm10
 ; SSE-NEXT:    movaps 64(%rdi), %xmm4
 ; SSE-NEXT:    movaps 112(%rdi), %xmm2
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3532,21 +3527,26 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT:    movaps %xmm4, %xmm2
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
-; SSE-NEXT:    movaps 224(%rdi), %xmm7
 ; SSE-NEXT:    movaps %xmm2, %xmm1
-; SSE-NEXT:    movaps 192(%rdi), %xmm8
 ; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 224(%rdi), %xmm7
 ; SSE-NEXT:    movaps %xmm7, %xmm1
-; SSE-NEXT:    movaps 160(%rdi), %xmm5
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
+; SSE-NEXT:    movaps 192(%rdi), %xmm8
 ; SSE-NEXT:    movaps %xmm8, %xmm6
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm6, %xmm0
 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 160(%rdi), %xmm5
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
 ; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm5, %xmm0
@@ -4674,7 +4674,6 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps 416(%rdi), %ymm8
 ; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 160(%rdi), %ymm9
-; AVX2-NEXT:    vmovaps 480(%rdi), %ymm15
 ; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 192(%rdi), %ymm10
 ; AVX2-NEXT:    vmovaps 224(%rdi), %ymm14
@@ -4682,14 +4681,15 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermps %ymm14, %ymm2, %ymm0
 ; AVX2-NEXT:    vpermps %ymm10, %ymm2, %ymm1
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 144(%rdi), %xmm3
 ; AVX2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovaps 448(%rdi), %ymm4
 ; AVX2-NEXT:    vmovaps 128(%rdi), %xmm1
 ; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; AVX2-NEXT:    vpermps %ymm9, %ymm2, %ymm3
+; AVX2-NEXT:    vmovaps 448(%rdi), %ymm4
+; AVX2-NEXT:    vmovaps 480(%rdi), %ymm15
+; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -5167,7 +5167,6 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm15
 ; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm14
@@ -5175,14 +5174,15 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermps %ymm14, %ymm2, %ymm0
 ; AVX2-FP-NEXT:    vpermps %ymm10, %ymm2, %ymm1
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 144(%rdi), %xmm3
 ; AVX2-FP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovaps 128(%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; AVX2-FP-NEXT:    vpermps %ymm9, %ymm2, %ymm3
+; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm15
+; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -5660,7 +5660,6 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm14
@@ -5668,14 +5667,15 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm2, %ymm0
 ; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm2, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 144(%rdi), %xmm3
 ; AVX2-FCP-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm2, %ymm3
+; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index 1238b1c0976286..c89040d9388d00 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -688,41 +688,41 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm5
-; SSE-NEXT:    movapd 80(%rdi), %xmm11
+; SSE-NEXT:    movdqa 80(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 96(%rdi), %xmm1
 ; SSE-NEXT:    movdqa (%rdi), %xmm14
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm7
-; SSE-NEXT:    movdqa 48(%rdi), %xmm2
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm9
+; SSE-NEXT:    movdqa 48(%rdi), %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm14, %xmm8
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[2,2,2,2]
-; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; SSE-NEXT:    movdqa 64(%rdi), %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; SSE-NEXT:    movapd %xmm11, %xmm10
 ; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1]
-; SSE-NEXT:    unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1]
+; SSE-NEXT:    movdqa %xmm11, %xmm10
+; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
 ; SSE-NEXT:    movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1]
-; SSE-NEXT:    movdqa 144(%rdi), %xmm3
 ; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm14[1,1,1,1]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
+; SSE-NEXT:    movdqa 64(%rdi), %xmm0
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
 ; SSE-NEXT:    movsd {{.*#+}} xmm10 = xmm12[0],xmm10[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm1[2,3,2,3]
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm3[0,0,1,1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[2,3,2,3]
+; SSE-NEXT:    movdqa 144(%rdi), %xmm3
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm3[0,0,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1]
 ; SSE-NEXT:    movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm7[2,2,3,3]
@@ -1294,22 +1294,23 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 240(%rdi), %xmm14
 ; SSE-NEXT:    movdqa 256(%rdi), %xmm8
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm6
-; SSE-NEXT:    movdqa 16(%rdi), %xmm15
-; SSE-NEXT:    movdqa 32(%rdi), %xmm5
-; SSE-NEXT:    movdqa 48(%rdi), %xmm4
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 192(%rdi), %xmm7
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movapd 160(%rdi), %xmm10
+; SSE-NEXT:    movdqa 192(%rdi), %xmm7
+; SSE-NEXT:    movdqa 160(%rdi), %xmm10
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm13
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
+; SSE-NEXT:    movdqa %xmm10, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movdqa (%rdi), %xmm11
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
+; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE-NEXT:    movdqa (%rdi), %xmm11
+; SSE-NEXT:    movdqa 16(%rdi), %xmm15
+; SSE-NEXT:    movdqa 32(%rdi), %xmm5
+; SSE-NEXT:    movdqa 48(%rdi), %xmm4
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm7, %xmm9
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm11, %xmm1
@@ -1343,7 +1344,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1666,10 +1667,9 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm8
-; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm9
+; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7]
@@ -1683,7 +1683,6 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7]
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-NEXT:    vpermd %ymm12, %ymm10, %ymm10
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1705,6 +1704,8 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermd %ymm13, %ymm12, %ymm12
 ; AVX2-NEXT:    vpbroadcastd 144(%rdi), %ymm13
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
@@ -1714,8 +1715,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
-; AVX2-NEXT:    vmovdqa %ymm1, %ymm12
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm15
+; AVX2-NEXT:    vpermd %ymm12, %ymm0, %ymm15
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
@@ -1784,10 +1784,9 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7]
@@ -1801,7 +1800,6 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1823,6 +1821,8 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
 ; AVX2-FP-NEXT:    vpbroadcastd 144(%rdi), %ymm13
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
@@ -1832,8 +1832,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
-; AVX2-FP-NEXT:    vmovdqa %ymm1, %ymm12
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm15
+; AVX2-FP-NEXT:    vpermd %ymm12, %ymm0, %ymm15
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
@@ -1902,10 +1901,9 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7]
@@ -1919,7 +1917,6 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1941,6 +1938,8 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
 ; AVX2-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm13
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
@@ -1950,8 +1949,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, %ymm12
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm15
+; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm0, %ymm15
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
@@ -2520,40 +2518,42 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $904, %rsp # imm = 0x388
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm5
-; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 32(%rdi), %xmm9
-; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 48(%rdi), %xmm8
 ; SSE-NEXT:    movdqa 448(%rdi), %xmm3
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 432(%rdi), %xmm4
-; SSE-NEXT:    movdqa 416(%rdi), %xmm14
-; SSE-NEXT:    movapd 128(%rdi), %xmm6
-; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 128(%rdi), %xmm6
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm7
+; SSE-NEXT:    movdqa 80(%rdi), %xmm12
+; SSE-NEXT:    movdqa 96(%rdi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movdqa 96(%rdi), %xmm15
-; SSE-NEXT:    movapd 80(%rdi), %xmm12
-; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm1, %xmm15
+; SSE-NEXT:    movdqa %xmm12, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movdqa 400(%rdi), %xmm10
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1]
-; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE-NEXT:    movdqa (%rdi), %xmm11
+; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 400(%rdi), %xmm10
+; SSE-NEXT:    movdqa 416(%rdi), %xmm14
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
-; SSE-NEXT:    movdqa %xmm10, %xmm6
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
+; SSE-NEXT:    movdqa (%rdi), %xmm11
+; SSE-NEXT:    movdqa 32(%rdi), %xmm9
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 48(%rdi), %xmm8
+; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm10, %xmm6
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm11, %xmm1
 ; SSE-NEXT:    movdqa %xmm11, %xmm5
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -2562,11 +2562,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 336(%rdi), %xmm1
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 320(%rdi), %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm1, %xmm9
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 320(%rdi), %xmm2
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    movdqa 368(%rdi), %xmm2
@@ -3353,16 +3353,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 416(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm6
 ; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm8
-; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm9
 ; AVX2-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm15
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm14
+; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
@@ -3370,13 +3366,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3]
-; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
+; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm9
+; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
@@ -3632,16 +3632,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm15
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm14
+; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
@@ -3649,13 +3645,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3]
-; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
+; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
@@ -3911,16 +3911,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
@@ -3928,13 +3924,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
@@ -5005,22 +5005,22 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 432(%rdi), %xmm8
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm7
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 416(%rdi), %xmm9
-; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm5
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 96(%rdi), %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-NEXT:    movdqa 400(%rdi), %xmm10
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    movdqa 400(%rdi), %xmm10
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 416(%rdi), %xmm9
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
@@ -6742,34 +6742,34 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 896(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm15
-; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm9
-; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm11
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm12
-; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm13
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
-; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm8
-; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm7
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm10
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm9
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
+; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqa %ymm8, %ymm9
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -7299,34 +7299,34 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 896(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm15
-; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm11
-; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm12
-; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
-; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
+; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa %ymm8, %ymm9
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -7856,34 +7856,34 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 896(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
+; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm9
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index 864e41510030b2..1a56d2a4b452dd 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -1664,47 +1664,47 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 240(%rdi), %xmm9
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 336(%rdi), %xmm14
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 304(%rdi), %xmm7
-; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 352(%rdi), %xmm5
 ; SSE-NEXT:    movdqa 64(%rdi), %xmm12
-; SSE-NEXT:    movdqa 288(%rdi), %xmm15
+; SSE-NEXT:    movdqa (%rdi), %xmm8
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE-NEXT:    movapd (%rdi), %xmm8
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movdqa 208(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm13
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm1, %xmm11
-; SSE-NEXT:    movdqa 256(%rdi), %xmm3
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 192(%rdi), %xmm10
+; SSE-NEXT:    movdqa %xmm8, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm8[0],xmm2[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
 ; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1]
+; SSE-NEXT:    movdqa 352(%rdi), %xmm5
+; SSE-NEXT:    movdqa 288(%rdi), %xmm15
+; SSE-NEXT:    movdqa 304(%rdi), %xmm7
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm15, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
 ; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 256(%rdi), %xmm2
+; SSE-NEXT:    movdqa 192(%rdi), %xmm10
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 208(%rdi), %xmm4
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm4, %xmm14
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movdqa %xmm3, %xmm2
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3]
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
 ; SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
@@ -2145,13 +2145,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 96(%rdi), %ymm15
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 160(%rdi), %ymm3
 ; AVX2-NEXT:    vmovaps (%rdi), %ymm4
-; AVX2-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 32(%rdi), %ymm5
-; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 64(%rdi), %ymm13
 ; AVX2-NEXT:    vmovaps {{.*#+}} xmm6 = [0,6,4,u]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
@@ -2160,6 +2155,11 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-NEXT:    vmovaps 160(%rdi), %ymm3
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovaps %ymm0, %ymm7
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2322,13 +2322,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm15
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm5
-; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm13
 ; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm6 = [0,6,4,u]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
@@ -2337,6 +2332,11 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps %ymm0, %ymm7
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2502,18 +2502,17 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm13
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [0,6,4,u]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [0,6,4,u]
 ; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm12, %ymm7
-; AVX2-FCP-NEXT:    vmovaps %ymm1, %ymm5
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm15[0,1],ymm1[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm15[0,1],ymm5[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm0[4,5,6,7]
@@ -3397,23 +3396,25 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 496(%rdi), %xmm4
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm10
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm2
-; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 96(%rdi), %xmm6
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 112(%rdi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE-NEXT:    movdqa 96(%rdi), %xmm1
-; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 480(%rdi), %xmm8
-; SSE-NEXT:    movdqa 112(%rdi), %xmm11
+; SSE-NEXT:    movdqa %xmm1, %xmm11
+; SSE-NEXT:    movdqa %xmm6, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
 ; SSE-NEXT:    movdqa %xmm2, %xmm6
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE-NEXT:    movdqa 480(%rdi), %xmm8
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
 ; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm8, %xmm1
 ; SSE-NEXT:    movdqa %xmm8, %xmm4
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index b268c4a984cc19..a69825205cc04f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -2019,26 +2019,20 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $440, %rsp # imm = 0x1B8
 ; SSE-NEXT:    movdqa 240(%rdi), %xmm6
-; SSE-NEXT:    movdqa 272(%rdi), %xmm5
-; SSE-NEXT:    movdqa 224(%rdi), %xmm15
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm7
 ; SSE-NEXT:    movdqa (%rdi), %xmm2
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm8
-; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 160(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm9
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm14
+; SSE-NEXT:    movdqa 160(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm4
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm0
-; SSE-NEXT:    movdqa 304(%rdi), %xmm3
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE-NEXT:    movdqa %xmm4, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
-; SSE-NEXT:    movdqa %xmm11, %xmm12
-; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2046,12 +2040,18 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
-; SSE-NEXT:    movdqa %xmm9, %xmm11
-; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
+; SSE-NEXT:    movdqa 304(%rdi), %xmm3
+; SSE-NEXT:    movdqa 272(%rdi), %xmm5
+; SSE-NEXT:    movdqa 224(%rdi), %xmm15
+; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm11, %xmm12
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm9, %xmm11
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm15, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
@@ -4211,41 +4211,41 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm8
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 640(%rdi), %xmm3
-; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%rdi), %xmm6
-; SSE-NEXT:    movdqa 48(%rdi), %xmm5
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 608(%rdi), %xmm4
-; SSE-NEXT:    movdqa 576(%rdi), %xmm1
 ; SSE-NEXT:    movdqa 560(%rdi), %xmm10
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 576(%rdi), %xmm1
+; SSE-NEXT:    movdqa 192(%rdi), %xmm7
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm9
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 192(%rdi), %xmm7
-; SSE-NEXT:    movaps 128(%rdi), %xmm0
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
+; SSE-NEXT:    movdqa 128(%rdi), %xmm0
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT:    movdqa %xmm7, %xmm14
-; SSE-NEXT:    movdqa (%rdi), %xmm13
-; SSE-NEXT:    movdqa %xmm9, %xmm12
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
 ; SSE-NEXT:    movdqa %xmm10, %xmm2
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3]
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT:    movdqa (%rdi), %xmm13
+; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%rdi), %xmm6
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 48(%rdi), %xmm5
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm9, %xmm12
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm7, %xmm14
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, %xmm7
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
 ; SSE-NEXT:    movdqa %xmm13, %xmm2
@@ -8552,22 +8552,22 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movdqa 608(%rdi), %xmm6
 ; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm2
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 576(%rdi), %xmm7
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 112(%rdi), %xmm1
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm15
+; SSE-NEXT:    movdqa 112(%rdi), %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT:    movdqa 560(%rdi), %xmm10
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3]
-; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    movdqa 560(%rdi), %xmm10
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 576(%rdi), %xmm7
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
@@ -11205,20 +11205,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm12
 ; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm8
-; AVX2-NEXT:    vmovdqa 672(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm3
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
-; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm6
-; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
-; AVX2-NEXT:    vmovdqa %ymm3, %ymm11
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpbroadcastq 304(%rdi), %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm6
+; AVX2-NEXT:    vmovdqa 672(%rdi), %ymm7
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm11
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 352(%rdi), %xmm2
 ; AVX2-NEXT:    vmovdqa 384(%rdi), %xmm3
@@ -12235,20 +12235,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm12
 ; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovdqa 672(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
-; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm11
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovdqa 672(%rdi), %ymm7
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
+; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm11
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %xmm3
@@ -13263,21 +13263,19 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 1120(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovdqa 672(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
-; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm10
+; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovdqa 672(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
 ; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm11
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %xmm3
@@ -13289,9 +13287,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm8
 ; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpbroadcastq 752(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index 9448acd134008d..68e68f62cfd000 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -2429,12 +2429,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5]
 ; AVX-NEXT:    vmovaps 480(%rdi), %ymm9
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
 ; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
 ; AVX-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5]
@@ -4402,38 +4402,38 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps 608(%rdi), %xmm6
 ; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 672(%rdi), %xmm8
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 640(%rdi), %xmm4
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 736(%rdi), %xmm9
-; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 704(%rdi), %xmm3
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 160(%rdi), %xmm10
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 224(%rdi), %xmm2
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 192(%rdi), %xmm0
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 128(%rdi), %xmm1
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT:    movaps 128(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm1, %xmm2
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
 ; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 576(%rdi), %xmm7
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
 ; SSE-NEXT:    movaps %xmm4, %xmm3
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 576(%rdi), %xmm7
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm7, %xmm0
@@ -9222,38 +9222,38 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT:    movaps 352(%rdi), %xmm5
 ; SSE-NEXT:    movaps %xmm5, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movaps 416(%rdi), %xmm7
-; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 384(%rdi), %xmm8
-; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 480(%rdi), %xmm9
-; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 448(%rdi), %xmm3
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 160(%rdi), %xmm10
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 224(%rdi), %xmm2
-; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 192(%rdi), %xmm0
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 128(%rdi), %xmm1
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT:    movaps 128(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm1, %xmm2
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
 ; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 320(%rdi), %xmm6
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
 ; SSE-NEXT:    movaps %xmm8, %xmm3
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 320(%rdi), %xmm6
+; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm6, %xmm0
@@ -15802,49 +15802,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512-NEXT:    vmovdqa64 1472(%rdi), %zmm20
 ; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm16
-; AVX512-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm7
-; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
 ; AVX512-NEXT:    movb $-64, %al
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm9
 ; AVX512-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
-; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm25
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
 ; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm29
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512-NEXT:    vmovdqa64 %zmm19, %zmm30
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
@@ -15853,21 +15847,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm17
-; AVX512-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm4
 ; AVX512-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqa64 1600(%rdi), %zmm24
+; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
 ; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512-NEXT:    vmovdqa64 %zmm31, %zmm24
-; AVX512-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -16399,49 +16398,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
-; AVX512-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    movb $-64, %al
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
 ; AVX512-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm25
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm29
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512-FCP-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm30
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
@@ -16450,21 +16443,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
 ; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm24
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
-; AVX512-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -16996,49 +16994,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-NEXT:    vmovdqa64 1472(%rdi), %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm16
-; AVX512DQ-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm7
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512DQ-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    movb $-64, %al
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm9
 ; AVX512DQ-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm25
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm29
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512DQ-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm19, %zmm30
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
@@ -17047,21 +17039,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm17
-; AVX512DQ-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm4
 ; AVX512DQ-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512DQ-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 1600(%rdi), %zmm24
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512DQ-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 %zmm31, %zmm24
-; AVX512DQ-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512DQ-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -17593,49 +17590,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm25
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm29
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
@@ -17644,21 +17635,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -18190,49 +18186,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm16
-; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
 ; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
 ; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm25
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm29
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm30
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
@@ -18241,21 +18231,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm4
 ; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm24
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -18787,49 +18782,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512BW-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
-; AVX512BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    movb $-64, %al
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm25
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm29
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm30
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
@@ -18838,21 +18827,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
-; AVX512BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -19384,49 +19378,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-BW-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm16
-; AVX512DQ-BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512DQ-BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    movb $-64, %al
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm9
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm25
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm29
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, %zmm30
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
@@ -19435,21 +19423,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm17
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm4
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 1600(%rdi), %zmm24
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm31, %zmm24
-; AVX512DQ-BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512DQ-BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -19981,49 +19974,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm25
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm29
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
@@ -20032,21 +20019,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
index 9ed29fc54dbc1f..ae144d9e4f4982 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
@@ -1418,22 +1418,22 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps 416(%rdi), %xmm7
 ; AVX2-NEXT:    vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
 ; AVX2-NEXT:    vmovaps 384(%rdi), %xmm7
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
-; AVX2-NEXT:    vmovaps 192(%rdi), %ymm5
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps (%rdi), %xmm8
 ; AVX2-NEXT:    vmovaps 32(%rdi), %xmm13
 ; AVX2-NEXT:    vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
 ; AVX2-NEXT:    vinsertf128 $1, 64(%rdi), %ymm8, %ymm15
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 288(%rdi), %xmm14
-; AVX2-NEXT:    vinsertf128 $1, 352(%rdi), %ymm14, %ymm1
 ; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 256(%rdi), %xmm10
 ; AVX2-NEXT:    vinsertf128 $1, 320(%rdi), %ymm10, %ymm0
+; AVX2-NEXT:    vmovaps 192(%rdi), %ymm10
+; AVX2-NEXT:    vmovaps 288(%rdi), %xmm14
+; AVX2-NEXT:    vinsertf128 $1, 352(%rdi), %ymm14, %ymm1
 ; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
@@ -1443,8 +1443,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm5, %ymm10
-; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2]
+; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm14[0],ymm10[2],ymm14[2]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2]
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
@@ -1536,22 +1535,22 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps 416(%rdi), %xmm7
 ; AVX2-FP-NEXT:    vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
 ; AVX2-FP-NEXT:    vmovaps 384(%rdi), %xmm7
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
-; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm8
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %xmm13
 ; AVX2-FP-NEXT:    vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
 ; AVX2-FP-NEXT:    vinsertf128 $1, 64(%rdi), %ymm8, %ymm15
 ; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm14
-; AVX2-FP-NEXT:    vinsertf128 $1, 352(%rdi), %ymm14, %ymm1
 ; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 256(%rdi), %xmm10
 ; AVX2-FP-NEXT:    vinsertf128 $1, 320(%rdi), %ymm10, %ymm0
+; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm14
+; AVX2-FP-NEXT:    vinsertf128 $1, 352(%rdi), %ymm14, %ymm1
 ; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
@@ -1561,8 +1560,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm5, %ymm10
-; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2]
+; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm14[0],ymm10[2],ymm14[2]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2]
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
@@ -1654,22 +1652,22 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %xmm7
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
 ; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm8
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %xmm13
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 64(%rdi), %ymm8, %ymm15
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %xmm14
-; AVX2-FCP-NEXT:    vinsertf128 $1, 352(%rdi), %ymm14, %ymm1
 ; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %xmm10
 ; AVX2-FCP-NEXT:    vinsertf128 $1, 320(%rdi), %ymm10, %ymm0
+; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %xmm14
+; AVX2-FCP-NEXT:    vinsertf128 $1, 352(%rdi), %ymm14, %ymm1
 ; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
@@ -1679,8 +1677,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, %ymm10
-; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2]
+; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm14[0],ymm10[2],ymm14[2]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2]
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index 1ef07aabc54c91..70b046ac4d1fe1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -798,18 +798,18 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-LABEL: load_i64_stride5_vf8:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovapd 128(%rdi), %ymm1
-; AVX-NEXT:    vmovaps 96(%rdi), %ymm2
-; AVX-NEXT:    vmovapd 224(%rdi), %ymm9
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT:    vmovapd 96(%rdi), %ymm2
 ; AVX-NEXT:    vmovapd 64(%rdi), %ymm7
-; AVX-NEXT:    vmovaps (%rdi), %xmm10
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm7[0,1,2],ymm2[3]
+; AVX-NEXT:    vmovapd (%rdi), %xmm10
+; AVX-NEXT:    vmovapd 32(%rdi), %xmm4
+; AVX-NEXT:    vblendpd {{.*#+}} xmm5 = xmm10[0],xmm4[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
+; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovapd 256(%rdi), %ymm0
+; AVX-NEXT:    vmovapd 224(%rdi), %ymm9
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm8
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm11
-; AVX-NEXT:    vmovapd 256(%rdi), %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm4[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3]
 ; AVX-NEXT:    vmovapd %ymm0, %ymm3
 ; AVX-NEXT:    vmovapd 192(%rdi), %xmm5
@@ -849,9 +849,9 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm15[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3]
+; AVX-NEXT:    vblendpd {{.*#+}} xmm2 = xmm4[0],xmm15[1]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm12[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
@@ -865,7 +865,7 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd %ymm11, 32(%r8)
 ; AVX-NEXT:    vmovapd %ymm8, (%r8)
 ; AVX-NEXT:    vmovapd %ymm0, 32(%r9)
-; AVX-NEXT:    vmovaps %ymm1, (%r9)
+; AVX-NEXT:    vmovapd %ymm1, (%r9)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -10965,11 +10965,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
-; AVX512-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
 ; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
@@ -11433,11 +11433,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
@@ -11901,11 +11901,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
-; AVX512DQ-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512DQ-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
@@ -12369,11 +12369,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
@@ -12837,11 +12837,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
@@ -13305,11 +13305,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
@@ -13773,11 +13773,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
-; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
@@ -14241,11 +14241,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index 3dbd078504caa3..ee5a98e0e59c19 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -922,18 +922,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movaps 240(%rdi), %xmm3
 ; SSE-NEXT:    movaps 192(%rdi), %xmm12
 ; SSE-NEXT:    movaps 336(%rdi), %xmm4
-; SSE-NEXT:    movaps %xmm9, %xmm14
 ; SSE-NEXT:    movaps 288(%rdi), %xmm9
+; SSE-NEXT:    movaps %xmm9, %xmm14
 ; SSE-NEXT:    movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0]
-; SSE-NEXT:    movaps 96(%rdi), %xmm11
-; SSE-NEXT:    movaps %xmm14, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1]
 ; SSE-NEXT:    movaps %xmm12, %xmm4
 ; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1]
+; SSE-NEXT:    movaps 96(%rdi), %xmm3
+; SSE-NEXT:    movaps %xmm14, (%rsp) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm11, %xmm3
+; SSE-NEXT:    movaps %xmm3, %xmm11
 ; SSE-NEXT:    movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0]
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2503,8 +2503,6 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $488, %rsp # imm = 0x1E8
 ; AVX2-NEXT:    vmovaps 512(%rdi), %ymm7
-; AVX2-NEXT:    vmovaps 288(%rdi), %ymm12
-; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 480(%rdi), %ymm4
 ; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
@@ -2513,10 +2511,8 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps (%rdi), %xmm1
 ; AVX2-NEXT:    vmovaps 48(%rdi), %xmm5
 ; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
-; AVX2-NEXT:    vmovaps 320(%rdi), %ymm10
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-NEXT:    vmovaps %ymm0, %ymm15
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2524,9 +2520,13 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps 384(%rdi), %xmm6
 ; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0]
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
-; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX2-NEXT:    vmovaps 288(%rdi), %ymm12
+; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 240(%rdi), %xmm11
 ; AVX2-NEXT:    vmovaps 192(%rdi), %xmm9
@@ -2717,8 +2717,6 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $488, %rsp # imm = 0x1E8
 ; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm12
-; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
@@ -2727,10 +2725,8 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovaps 48(%rdi), %xmm5
 ; AVX2-FP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
-; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-FP-NEXT:    vmovaps %ymm0, %ymm15
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2738,9 +2734,13 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps 384(%rdi), %xmm6
 ; AVX2-FP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0]
 ; AVX2-FP-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
-; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm12
+; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 240(%rdi), %xmm11
 ; AVX2-FP-NEXT:    vmovaps 192(%rdi), %xmm9
@@ -2931,8 +2931,6 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $488, %rsp # imm = 0x1E8
 ; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm0
@@ -2941,10 +2939,8 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovaps 48(%rdi), %xmm5
 ; AVX2-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
-; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, %ymm15
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2952,9 +2948,13 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %xmm6
 ; AVX2-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0]
 ; AVX2-FCP-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
-; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm12
+; AVX2-FCP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 240(%rdi), %xmm11
 ; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %xmm9
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 16647d0da63c5b..da66e846d4c953 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -9707,13 +9707,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    subq $2760, %rsp # imm = 0xAC8
 ; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
+; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm30
 ; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm28
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm12
@@ -9726,6 +9730,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm6
@@ -9737,17 +9742,18 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
 ; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
@@ -9793,11 +9799,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm16
 ; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm16, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm9
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm18, %zmm29
 ; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10180,13 +10181,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    subq $2760, %rsp # imm = 0xAC8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm29
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
+; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm30
 ; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
 ; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm12
@@ -10199,6 +10204,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm6
@@ -10210,17 +10216,18 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
@@ -10266,11 +10273,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm16
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm9
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm18, %zmm29
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10653,13 +10655,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    subq $2760, %rsp # imm = 0xAC8
 ; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm29
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm13
+; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm30
 ; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %zmm28
 ; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm12
@@ -10672,6 +10678,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm6
@@ -10683,17 +10690,18 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512DQ-BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm3
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm3
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512DQ-BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm3
@@ -10739,11 +10747,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1600(%rdi), %zmm16
 ; AVX512DQ-BW-NEXT:    vpermi2q %zmm1, %zmm16, %zmm8
-; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm9
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm4, %zmm18, %zmm29
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -11126,13 +11129,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    subq $2760, %rsp # imm = 0xAC8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm12
@@ -11145,6 +11152,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm6
@@ -11156,17 +11164,18 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm3
@@ -11212,11 +11221,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1664(%rdi), %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1600(%rdi), %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm1, %zmm16, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm18, %zmm29
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -17264,38 +17268,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 2752(%rdi), %zmm18
-; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 2688(%rdi), %zmm7
-; AVX512-NEXT:    vmovdqa64 2432(%rdi), %zmm17
-; AVX512-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 2368(%rdi), %zmm9
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 1984(%rdi), %zmm11
-; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 1536(%rdi), %zmm12
 ; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-NEXT:    vmovdqa64 1472(%rdi), %zmm13
-; AVX512-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 1088(%rdi), %zmm14
-; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 1024(%rdi), %zmm4
-; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm10
-; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 576(%rdi), %zmm6
 ; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm15
-; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -17303,10 +17286,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512-NEXT:    vmovdqa 464(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -17315,10 +17298,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqa64 2944(%rdi), %zmm12
+; AVX512-NEXT:    vmovdqa64 2752(%rdi), %zmm18
+; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 2688(%rdi), %zmm7
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 2432(%rdi), %zmm17
+; AVX512-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 2368(%rdi), %zmm9
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 1984(%rdi), %zmm11
+; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 1088(%rdi), %zmm14
+; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 1360(%rdi), %xmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 1024(%rdi), %zmm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm14, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa 912(%rdi), %xmm2
@@ -17351,9 +17355,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
 ; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm12
-; AVX512-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm2
 ; AVX512-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
 ; AVX512-NEXT:    vmovdqa64 3072(%rdi), %zmm3
 ; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -18263,38 +18266,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512-FCP-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm18
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm17
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm11
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm13
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm14
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm10
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm15
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -18302,10 +18284,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -18314,10 +18296,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm18
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm7
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm17
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm14
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm2
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa 912(%rdi), %xmm2
@@ -18350,9 +18353,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
 ; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm2
 ; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -19262,38 +19264,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 2752(%rdi), %zmm18
-; AVX512DQ-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 2688(%rdi), %zmm7
-; AVX512DQ-NEXT:    vmovdqa64 2432(%rdi), %zmm17
-; AVX512DQ-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 2368(%rdi), %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 1984(%rdi), %zmm11
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 1536(%rdi), %zmm12
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 1472(%rdi), %zmm13
-; AVX512DQ-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 1088(%rdi), %zmm14
-; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 1024(%rdi), %zmm4
-; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm10
-; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 576(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm15
-; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -19301,10 +19282,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512DQ-NEXT:    vmovdqa 464(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -19313,10 +19294,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 2944(%rdi), %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 2752(%rdi), %zmm18
+; AVX512DQ-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 2688(%rdi), %zmm7
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 2432(%rdi), %zmm17
+; AVX512DQ-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 2368(%rdi), %zmm9
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 1984(%rdi), %zmm11
+; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 1088(%rdi), %zmm14
+; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 1360(%rdi), %xmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 1024(%rdi), %zmm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm14, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa 912(%rdi), %xmm2
@@ -19349,9 +19351,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm12
-; AVX512DQ-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm2
 ; AVX512DQ-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
 ; AVX512DQ-NEXT:    vmovdqa64 3072(%rdi), %zmm3
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -20261,38 +20262,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1536(%rdi), %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -20300,10 +20280,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -20312,10 +20292,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1984(%rdi), %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 1360(%rdi), %xmm2
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa 912(%rdi), %xmm2
@@ -20348,9 +20349,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -21259,14 +21259,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 3008(%rdi), %zmm25
 ; AVX512BW-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %zmm30
-; AVX512BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm16
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa 2704(%rdi), %xmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 2944(%rdi), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %zmm30
 ; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -21280,28 +21291,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 2704(%rdi), %xmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 464(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -21347,9 +21347,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
 ; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm10, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 3072(%rdi), %zmm2
@@ -22241,14 +22240,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm25
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm30
-; AVX512BW-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm30
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -22262,28 +22272,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm11
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm17
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -22329,9 +22328,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
 ; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm16
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm10, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm2
@@ -23223,14 +23221,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 3008(%rdi), %zmm25
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 2752(%rdi), %zmm30
-; AVX512DQ-BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm16
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa 2704(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 2944(%rdi), %zmm16
+; AVX512DQ-BW-NEXT:    vmovdqa64 2752(%rdi), %zmm30
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2688(%rdi), %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2432(%rdi), %zmm19
-; AVX512DQ-BW-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 2368(%rdi), %zmm8
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -23244,28 +23253,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm11
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm16
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm17
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm15
-; AVX512DQ-BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 2704(%rdi), %xmm2
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm1
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa 464(%rdi), %xmm2
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm15
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -23311,9 +23309,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm16
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm25, %zmm10, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 3072(%rdi), %zmm2
@@ -24205,14 +24202,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3008(%rdi), %zmm25
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm30
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2752(%rdi), %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2688(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2432(%rdi), %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 2368(%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -24226,28 +24234,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 576(%rdi), %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 2704(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 464(%rdi), %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
@@ -24293,9 +24290,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm10, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 3072(%rdi), %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index 80f628099ee897..a9ee7c6c742177 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -4047,10 +4047,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm31
 ; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4063,7 +4060,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm17
 ; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm19
 ; AVX512-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm18
@@ -4075,16 +4071,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4098,20 +4093,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
-; AVX512-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
-; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
-; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -4119,23 +4109,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
 ; AVX512-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
 ; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
+; AVX512-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -4270,10 +4269,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512-FCP-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
 ; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4286,7 +4282,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm17
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm19
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm18
@@ -4298,16 +4293,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4321,20 +4315,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
-; AVX512-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
-; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -4342,23 +4331,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
+; AVX512-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -4493,10 +4491,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512DQ-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm31
 ; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4509,7 +4504,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm17
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm19
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm18
@@ -4521,16 +4515,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4544,20 +4537,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
-; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -4565,23 +4553,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512DQ-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
+; AVX512DQ-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512DQ-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -4716,10 +4713,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512DQ-FCP-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4732,7 +4726,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm19
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm18
@@ -4744,16 +4737,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4767,20 +4759,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -4788,23 +4775,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -4939,10 +4935,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512BW-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm31
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -4955,7 +4948,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm17
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm19
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm18
@@ -4967,16 +4959,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -4990,20 +4981,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -5011,23 +4997,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -5162,10 +5157,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512BW-FCP-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
 ; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -5178,7 +5170,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm17
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm19
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm18
@@ -5190,16 +5181,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -5213,20 +5203,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
-; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -5234,23 +5219,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
+; AVX512BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -5385,10 +5379,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512DQ-BW-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm31
 ; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -5401,7 +5392,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm17
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm19
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm18
@@ -5413,16 +5403,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -5436,20 +5425,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -5457,23 +5441,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
+; AVX512DQ-BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -5608,10 +5601,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm10
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm30
@@ -5624,7 +5614,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm18
@@ -5636,16 +5625,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm24
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm11
@@ -5659,20 +5647,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 512(%rdi), %ymm27
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm1
@@ -5680,23 +5663,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -8745,23 +8737,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    movb $-64, %al
-; AVX512-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm25
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -8772,6 +8753,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm31
+; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm20
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
@@ -8786,6 +8772,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
+; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm4
 ; AVX512-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
@@ -8853,10 +8845,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
-; AVX512-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512-NEXT:    vmovdqa64 1216(%rdi), %zmm13
 ; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm29, %zmm15
 ; AVX512-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -8986,6 +8978,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -8997,7 +8990,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -9266,23 +9258,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    movb $-64, %al
-; AVX512-FCP-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -9293,6 +9274,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
+; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
@@ -9307,6 +9293,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm4
 ; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
@@ -9374,10 +9366,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
-; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm15
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -9507,6 +9499,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512-FCP-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512-FCP-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -9518,7 +9511,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -9787,23 +9779,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    movb $-64, %al
-; AVX512DQ-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm25
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512DQ-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -9814,6 +9795,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm31
+; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm20
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512DQ-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
@@ -9828,6 +9814,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512DQ-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512DQ-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm4
 ; AVX512DQ-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
@@ -9895,10 +9887,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512DQ-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512DQ-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
-; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 1216(%rdi), %zmm13
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm29, %zmm15
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10028,6 +10020,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512DQ-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -10039,7 +10032,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -10308,23 +10300,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    movb $-64, %al
-; AVX512DQ-FCP-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -10335,6 +10316,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
@@ -10349,6 +10335,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
@@ -10416,10 +10408,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm15
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10549,6 +10541,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -10560,7 +10553,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -10829,23 +10821,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm25
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -10856,6 +10837,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm20
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
@@ -10870,6 +10856,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm4
 ; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
@@ -10937,10 +10929,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm13
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm15
 ; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -11070,6 +11062,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -11081,7 +11074,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -11350,23 +11342,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    movb $-64, %al
-; AVX512BW-FCP-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -11377,6 +11358,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
@@ -11391,6 +11377,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm4
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
@@ -11458,10 +11450,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm15
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -11591,6 +11583,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -11602,7 +11595,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -11871,23 +11863,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    movb $-64, %al
-; AVX512DQ-BW-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm25
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -11898,6 +11879,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm31
+; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm20
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
@@ -11912,6 +11898,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, %zmm4
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
@@ -11979,10 +11971,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 1216(%rdi), %zmm13
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm29, %zmm15
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -12112,6 +12104,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -12123,7 +12116,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -12389,26 +12381,15 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1280(%rdi), %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1472(%rdi), %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1408(%rdi), %zmm27
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %ymm22
@@ -12419,6 +12400,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm20
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
@@ -12433,6 +12419,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
@@ -12500,10 +12492,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1088(%rdi), %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1024(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 1216(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -12633,6 +12625,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
@@ -12644,7 +12637,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
@@ -18961,31 +18953,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    movb $-64, %al
-; AVX512-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm15
-; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm14
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -18995,11 +18972,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 960(%rdi), %zmm6
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm0
 ; AVX512-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -19012,7 +18994,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm0
 ; AVX512-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm1
@@ -20032,31 +20024,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    movb $-64, %al
-; AVX512-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -20066,11 +20043,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -20083,7 +20065,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
@@ -21103,31 +21095,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    movb $-64, %al
-; AVX512DQ-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm15
-; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm14
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -21137,11 +21114,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 960(%rdi), %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm0
 ; AVX512DQ-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -21154,7 +21136,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512DQ-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm0
 ; AVX512DQ-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm1
@@ -22174,31 +22166,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    movb $-64, %al
-; AVX512DQ-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -22208,11 +22185,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -22225,7 +22207,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
@@ -23245,31 +23237,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
 ; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512BW-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -23279,11 +23256,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -23296,7 +23278,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm1
@@ -24316,31 +24308,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    movb $-64, %al
-; AVX512BW-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512BW-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512BW-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -24350,11 +24327,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -24367,7 +24349,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
@@ -25387,31 +25379,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    movb $-64, %al
-; AVX512DQ-BW-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-BW-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm15
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-BW-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -25421,11 +25398,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 960(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm0
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -25438,7 +25420,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm0
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, %zmm1
@@ -26458,31 +26450,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    movb $-64, %al
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -26492,11 +26469,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 960(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -26509,7 +26491,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index 429758c8350659..a03a0536c78cb3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -1038,7 +1038,6 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm8
 ; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
-; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm13, %xmm7
 ; SSE-NEXT:    pandn %xmm4, %xmm7
 ; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
@@ -1052,8 +1051,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    por %xmm7, %xmm0
 ; SSE-NEXT:    pxor %xmm9, %xmm9
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0]
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0]
 ; SSE-NEXT:    movdqa %xmm6, %xmm3
 ; SSE-NEXT:    pandn %xmm1, %xmm3
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
@@ -1068,16 +1067,13 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; SSE-NEXT:    pand %xmm7, %xmm0
 ; SSE-NEXT:    movdqa %xmm8, %xmm3
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535]
 ; SSE-NEXT:    movdqa %xmm15, %xmm1
 ; SSE-NEXT:    pandn %xmm3, %xmm1
-; SSE-NEXT:    movdqa 48(%rdi), %xmm12
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
 ; SSE-NEXT:    movdqa %xmm8, %xmm10
-; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm15, %xmm10
 ; SSE-NEXT:    por %xmm1, %xmm10
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[3,1,2,0]
@@ -1087,6 +1083,10 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    packuswb %xmm1, %xmm1
 ; SSE-NEXT:    movdqa %xmm7, %xmm10
 ; SSE-NEXT:    pandn %xmm1, %xmm10
+; SSE-NEXT:    movdqa 48(%rdi), %xmm12
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pand %xmm7, %xmm0
+; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    por %xmm0, %xmm10
 ; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm13, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 60fcf25b507b7b..1cd705b12a57e3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -1125,8 +1125,9 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm8, %xmm0
 ; SSE-NEXT:    movdqa %xmm2, %xmm6
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    movdqa %xmm2, %xmm15
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pand %xmm8, %xmm1
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm14, %xmm0
 ; SSE-NEXT:    pand %xmm8, %xmm0
@@ -1134,7 +1135,6 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm8, %xmm7
 ; SSE-NEXT:    packuswb %xmm0, %xmm7
 ; SSE-NEXT:    packuswb %xmm1, %xmm7
-; SSE-NEXT:    movdqa %xmm2, %xmm15
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
 ; SSE-NEXT:    pand %xmm8, %xmm0
 ; SSE-NEXT:    movdqa %xmm11, %xmm1
@@ -1960,7 +1960,6 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $600, %rsp # imm = 0x258
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm15
-; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm14
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm7
@@ -1977,8 +1976,8 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm2, %xmm5
 ; SSE-NEXT:    pand %xmm9, %xmm1
+; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm13, %xmm0
-; SSE-NEXT:    packuswb %xmm13, %xmm1
 ; SSE-NEXT:    pand %xmm9, %xmm0
 ; SSE-NEXT:    movdqa %xmm6, %xmm2
 ; SSE-NEXT:    pand %xmm9, %xmm2
@@ -1988,21 +1987,22 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
 ; SSE-NEXT:    pand %xmm9, %xmm0
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
-; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    pand %xmm9, %xmm1
+; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm7, %xmm0
-; SSE-NEXT:    movdqa 16(%rdi), %xmm8
 ; SSE-NEXT:    pand %xmm9, %xmm0
 ; SSE-NEXT:    movdqa %xmm4, %xmm2
 ; SSE-NEXT:    pand %xmm9, %xmm2
 ; SSE-NEXT:    packuswb %xmm0, %xmm2
 ; SSE-NEXT:    packuswb %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm14, %xmm0
 ; SSE-NEXT:    pand %xmm9, %xmm0
 ; SSE-NEXT:    movdqa %xmm15, %xmm1
 ; SSE-NEXT:    pand %xmm9, %xmm1
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
+; SSE-NEXT:    movdqa 16(%rdi), %xmm8
+; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm8, %xmm0
 ; SSE-NEXT:    pand %xmm9, %xmm0
 ; SSE-NEXT:    movdqa (%rdi), %xmm11
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index c215e2dd9f4d9d..c5f1742538c12d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -2126,10 +2126,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm4, %xmm0
-; SSE-NEXT:    movdqa 48(%rdi), %xmm2
 ; SSE-NEXT:    pandn %xmm1, %xmm0
+; SSE-NEXT:    movdqa 48(%rdi), %xmm2
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
 ; SSE-NEXT:    por %xmm0, %xmm1
@@ -4084,12 +4084,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm1
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    movdqa 208(%rdi), %xmm4
 ; SSE-NEXT:    pandn %xmm1, %xmm0
+; SSE-NEXT:    movdqa 208(%rdi), %xmm4
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm4, %xmm1
 ; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm2, %xmm14
@@ -4791,10 +4791,10 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT:    pandn %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm8, %xmm15
 ; SSE-NEXT:    pand %xmm8, %xmm13
 ; SSE-NEXT:    pand %xmm8, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index c7b73198c7f4d7..a696b0b2906884 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -2587,8 +2587,10 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm10, %xmm0
 ; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm10, %xmm4
-; SSE-NEXT:    movdqa %xmm11, %xmm1
+; SSE-NEXT:    pandn %xmm14, %xmm4
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT:    movdqa %xmm11, %xmm1
 ; SSE-NEXT:    pandn %xmm5, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm10, %xmm1
@@ -2674,8 +2676,6 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm5
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm10, %xmm5
-; SSE-NEXT:    pandn %xmm14, %xmm4
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm10, %xmm12
 ; SSE-NEXT:    movdqa %xmm11, %xmm4
 ; SSE-NEXT:    pandn %xmm9, %xmm4
@@ -4662,9 +4662,6 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 80(%rdi), %xmm5
 ; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rdi), %xmm2
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%rdi), %xmm6
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535]
@@ -4673,11 +4670,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0]
 ; SSE-NEXT:    movdqa %xmm3, %xmm2
 ; SSE-NEXT:    pandn %xmm0, %xmm2
-; SSE-NEXT:    movdqa (%rdi), %xmm7
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm13, %xmm2
 ; SSE-NEXT:    pandn %xmm0, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa (%rdi), %xmm7
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%rdi), %xmm6
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm13, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4881,7 +4881,6 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, %xmm4
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm6
-; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm7, %xmm2
 ; SSE-NEXT:    movdqa %xmm7, %xmm8
 ; SSE-NEXT:    pandn %xmm6, %xmm8
@@ -4892,6 +4891,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; SSE-NEXT:    pandn %xmm13, %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm3
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 48ef75742eccb1..d38fc38cb104f4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -1087,6 +1087,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm4, %xmm2
 ; SSE-NEXT:    movdqa %xmm4, %xmm3
 ; SSE-NEXT:    pand %xmm0, %xmm3
+; SSE-NEXT:    movdqa %xmm10, %xmm11
 ; SSE-NEXT:    pandn %xmm10, %xmm0
 ; SSE-NEXT:    por %xmm3, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm3
@@ -1096,7 +1097,6 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm10, %xmm11
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5]
@@ -1894,10 +1894,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm3, %xmm9
 ; SSE-NEXT:    movdqa %xmm6, %xmm3
 ; SSE-NEXT:    movdqa %xmm6, %xmm11
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm10, %xmm3
 ; SSE-NEXT:    por %xmm0, %xmm3
-; SSE-NEXT:    movdqa 96(%rdi), %xmm15
 ; SSE-NEXT:    movdqa %xmm3, %xmm0
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535]
@@ -1917,11 +1915,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
 ; SSE-NEXT:    pandn %xmm7, %xmm1
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm4, %xmm2
-; SSE-NEXT:    movdqa %xmm4, %xmm5
 ; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm13
 ; SSE-NEXT:    por %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pxor %xmm6, %xmm6
@@ -1931,8 +1926,12 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT:    movdqa %xmm15, %xmm2
-; SSE-NEXT:    movdqa %xmm15, %xmm3
+; SSE-NEXT:    movdqa 96(%rdi), %xmm2
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm4, %xmm5
+; SSE-NEXT:    movdqa %xmm3, %xmm13
+; SSE-NEXT:    movdqa %xmm2, %xmm3
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
@@ -3620,16 +3619,14 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    subq $648, %rsp # imm = 0x288
 ; SSE-NEXT:    movdqa 208(%rdi), %xmm14
 ; SSE-NEXT:    movdqa 112(%rdi), %xmm4
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 176(%rdi), %xmm6
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm3
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm1
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    movdqa 160(%rdi), %xmm7
 ; SSE-NEXT:    pandn %xmm1, %xmm0
+; SSE-NEXT:    movdqa 160(%rdi), %xmm7
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm7, %xmm1
 ; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm2, %xmm9
@@ -3646,7 +3643,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT:    packuswb %xmm0, %xmm2
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
-; SSE-NEXT:    movdqa 192(%rdi), %xmm5
 ; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm7, %xmm1
 ; SSE-NEXT:    pandn %xmm3, %xmm1
@@ -3665,7 +3661,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pandn %xmm1, %xmm4
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
 ; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    movdqa %xmm7, %xmm15
 ; SSE-NEXT:    por %xmm4, %xmm3
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
@@ -3676,6 +3671,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm0, %xmm3
 ; SSE-NEXT:    pandn %xmm2, %xmm3
 ; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    movdqa 192(%rdi), %xmm5
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 176(%rdi), %xmm6
+; SSE-NEXT:    movdqa %xmm7, %xmm15
 ; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm7, %xmm2
 ; SSE-NEXT:    pandn %xmm6, %xmm2
@@ -3942,11 +3941,11 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    pandn %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm12, %xmm7
 ; SSE-NEXT:    movdqa %xmm12, %xmm5
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pandn %xmm1, %xmm5
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3]
-; SSE-NEXT:    movdqa %xmm12, %xmm7
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    por %xmm0, %xmm1
@@ -7221,35 +7220,35 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm8
 ; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm3
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm1
-; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    movdqa 160(%rdi), %xmm6
 ; SSE-NEXT:    pandn %xmm1, %xmm0
+; SSE-NEXT:    movdqa 160(%rdi), %xmm6
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm6, %xmm1
-; SSE-NEXT:    pxor %xmm6, %xmm6
 ; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm2, %xmm7
 ; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm6
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT:    movdqa 112(%rdi), %xmm4
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
+; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535]
+; SSE-NEXT:    movdqa %xmm11, %xmm1
+; SSE-NEXT:    pandn %xmm3, %xmm1
+; SSE-NEXT:    movdqa 112(%rdi), %xmm4
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT:    packuswb %xmm0, %xmm2
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
-; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535]
-; SSE-NEXT:    movdqa %xmm11, %xmm1
-; SSE-NEXT:    pandn %xmm3, %xmm1
 ; SSE-NEXT:    movdqa %xmm4, %xmm3
 ; SSE-NEXT:    pand %xmm11, %xmm3
 ; SSE-NEXT:    por %xmm1, %xmm3
@@ -7855,13 +7854,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa %xmm4, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm15, %xmm0
 ; SSE-NEXT:    pandn %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm9, %xmm4
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm12, %xmm1
 ; SSE-NEXT:    movdqa %xmm12, %xmm2
+; SSE-NEXT:    movdqa %xmm12, %xmm1
+; SSE-NEXT:    movdqa %xmm9, %xmm4
 ; SSE-NEXT:    pandn %xmm9, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
+; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pand %xmm15, %xmm4
 ; SSE-NEXT:    por %xmm0, %xmm4
@@ -9479,12 +9478,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128]
 ; AVX-NEXT:    vpshufb %xmm6, %xmm7, %xmm14
 ; AVX-NEXT:    vpor %xmm5, %xmm14, %xmm5
+; AVX-NEXT:    vmovdqa %xmm12, %xmm14
 ; AVX-NEXT:    vpblendvb %xmm12, %xmm2, %xmm5, %xmm2
+; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpshufb %xmm9, %xmm13, %xmm5
 ; AVX-NEXT:    vmovdqa %xmm13, %xmm9
 ; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm2
-; AVX-NEXT:    vmovdqa %xmm12, %xmm14
-; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpshufb %xmm13, %xmm13, %xmm5
 ; AVX-NEXT:    vmovdqa %xmm1, %xmm12
 ; AVX-NEXT:    vpor %xmm5, %xmm2, %xmm1
 ; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm2
@@ -9936,8 +9935,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    # xmm9 = mem[0,0]
 ; AVX-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
 ; AVX-NEXT:    vpor %xmm7, %xmm10, %xmm7
-; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm10
 ; AVX-NEXT:    vmovdqa %xmm0, %xmm11
+; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm10
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm7, %ymm7
 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX-NEXT:    vandps %ymm0, %ymm3, %ymm3
@@ -10148,22 +10147,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $760, %rsp # imm = 0x2F8
 ; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm6
-; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm8
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm5
+; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm3, %ymm13
 ; AVX2-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa %ymm1, %ymm12
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
@@ -10171,9 +10166,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vpblendvb %ymm9, %ymm5, %ymm4, %ymm3
 ; AVX2-NEXT:    vmovdqa %ymm9, %ymm14
 ; AVX2-NEXT:    vmovdqa %ymm5, %ymm9
-; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa %ymm4, %ymm11
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
@@ -10182,6 +10175,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendvb %ymm13, %ymm7, %ymm8, %ymm5
 ; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa %ymm7, %ymm0
@@ -10687,22 +10686,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $760, %rsp # imm = 0x2F8
 ; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm5
+; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
 ; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm13
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa %ymm1, %ymm12
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
-; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vpor %xmm3, %xmm0, %xmm0
@@ -10710,9 +10705,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm5, %ymm4, %ymm3
 ; AVX2-FP-NEXT:    vmovdqa %ymm9, %ymm14
 ; AVX2-FP-NEXT:    vmovdqa %ymm5, %ymm9
-; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa %ymm4, %ymm11
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
 ; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
@@ -10721,6 +10714,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
 ; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm7, %ymm8, %ymm5
 ; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm0
@@ -11226,30 +11225,23 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $776, %rsp # imm = 0x308
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm1, %ymm12, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm7
-; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa %ymm1, %ymm13
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vpor %xmm3, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm5, %ymm4, %ymm3
 ; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm9
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm11
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
 ; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
@@ -11258,6 +11250,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm6, %ymm10, %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm7
@@ -11267,9 +11266,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX2-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX2-FCP-NEXT:    vmovdqa %ymm15, %ymm0
-; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm15, %ymm6, %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm0, %ymm6, %ymm2
 ; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm8
 ; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm5
@@ -13521,6 +13519,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512BW-NEXT:    vporq %xmm22, %xmm20, %xmm20
 ; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm22 {%k4}
 ; AVX512BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm23
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -13573,7 +13572,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm2 {%k3}
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -13894,6 +13892,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm20, %xmm20
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm22
+; AVX512BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
 ; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm23
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -13913,7 +13912,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm20, %ymm20
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX512BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm20, %xmm15
@@ -14138,13 +14136,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
 ; AVX512BW-FCP-NEXT:    vpermw %zmm26, %zmm0, %zmm0
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm7
+; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k1}
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX512BW-FCP-NEXT:    movl $4186112, %edi # imm = 0x3FE000
-; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
@@ -14254,6 +14252,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512DQ-BW-NEXT:    vporq %xmm22, %xmm20, %xmm20
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQ-BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm22 {%k4}
 ; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm23
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -14306,7 +14305,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm2 {%k3}
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -14624,6 +14622,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm20, %xmm20
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
 ; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm23
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -14643,7 +14642,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm20, %ymm20
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm20, %xmm15
@@ -14868,13 +14866,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
 ; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm26, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    movl $4186112, %edi # imm = 0x3FE000
-; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 72be7f0399fd52..75d05387403859 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -3793,10 +3793,6 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    subq $904, %rsp # imm = 0x388
 ; SSE-NEXT:    movdqa 128(%rdi), %xmm6
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 80(%rdi), %xmm10
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 144(%rdi), %xmm13
 ; SSE-NEXT:    movdqa 160(%rdi), %xmm11
 ; SSE-NEXT:    movdqa 176(%rdi), %xmm14
@@ -3810,8 +3806,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    movdqa %xmm8, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
-; SSE-NEXT:    packuswb %xmm1, %xmm0
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
+; SSE-NEXT:    packuswb %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm7, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm2
@@ -3822,8 +3818,6 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    movdqa %xmm11, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
-; SSE-NEXT:    movdqa 96(%rdi), %xmm12
-; SSE-NEXT:    movdqa 64(%rdi), %xmm5
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm13, %xmm0
 ; SSE-NEXT:    pand %xmm4, %xmm0
@@ -3831,11 +3825,17 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm4, %xmm3
 ; SSE-NEXT:    packuswb %xmm0, %xmm3
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
-; SSE-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    packuswb %xmm3, %xmm3
-; SSE-NEXT:    movdqa 112(%rdi), %xmm15
 ; SSE-NEXT:    packuswb %xmm0, %xmm3
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
+; SSE-NEXT:    movdqa 64(%rdi), %xmm5
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 80(%rdi), %xmm10
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 96(%rdi), %xmm12
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 112(%rdi), %xmm15
+; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm15, %xmm0
 ; SSE-NEXT:    pand %xmm4, %xmm0
@@ -8620,16 +8620,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movdqa 192(%rdi), %xmm3
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 224(%rdi), %xmm9
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 240(%rdi), %xmm12
 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0]
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    movdqa %xmm9, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
-; SSE-NEXT:    movdqa 208(%rdi), %xmm2
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    movdqa 208(%rdi), %xmm2
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pand %xmm4, %xmm1
 ; SSE-NEXT:    movdqa %xmm3, %xmm2
@@ -8648,10 +8648,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    pand %xmm4, %xmm3
 ; SSE-NEXT:    packuswb %xmm0, %xmm3
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
-; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    packuswb %xmm3, %xmm3
-; SSE-NEXT:    movdqa 112(%rdi), %xmm14
 ; SSE-NEXT:    packuswb %xmm0, %xmm3
+; SSE-NEXT:    movdqa 112(%rdi), %xmm14
+; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm14, %xmm0
@@ -14311,7 +14311,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm11
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
@@ -14324,6 +14323,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
 ; AVX512-NEXT:    vmovdqa64 %xmm13, %xmm30
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm9
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
 ; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm13
@@ -14837,6 +14837,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm16
 ; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -14854,8 +14855,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm28
 ; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa %ymm3, %ymm4
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
@@ -14866,10 +14867,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm10
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm16
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm10
 ; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm20
@@ -15626,7 +15626,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm9
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm11
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm15
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
@@ -15639,6 +15638,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm13, %xmm30
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm9
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm13
@@ -16152,6 +16152,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm16
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -16169,8 +16170,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm28
 ; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
@@ -16181,10 +16182,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm10
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm16
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm10
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm20
@@ -16459,34 +16459,34 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-NEXT:    vpshufb %xmm19, %xmm6, %xmm4
 ; AVX512BW-NEXT:    vmovdqa64 %xmm6, %xmm26
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-NEXT:    vmovdqa 448(%rdi), %xmm7
 ; AVX512BW-NEXT:    vpshufb %xmm19, %xmm7, %xmm6
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
 ; AVX512BW-NEXT:    vmovdqa64 %xmm7, %xmm30
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX512BW-NEXT:    vpmovqb %ymm4, %xmm4
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512BW-NEXT:    vmovdqa 368(%rdi), %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vmovdqa64 %xmm2, %xmm31
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovaps 368(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 352(%rdi), %xmm27
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm27, %xmm6
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
 ; AVX512BW-NEXT:    vmovdqa 336(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm11
 ; AVX512BW-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512BW-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512BW-NEXT:    vmovdqa 320(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm15
-; AVX512BW-NEXT:    vmovdqa %xmm2, %xmm9
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 352(%rdi), %xmm27
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm27, %xmm6
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512BW-NEXT:    vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa %xmm2, %xmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
 ; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm11
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
@@ -17324,19 +17324,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm28, %xmm15
-; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 %xmm1, %xmm20
+; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
 ; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; AVX512BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm28, %xmm15
 ; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %xmm1, %xmm20
 ; AVX512BW-FCP-NEXT:    vpsrlq $24, %zmm18, %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm25
+; AVX512BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm11
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
@@ -17586,34 +17586,34 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm6, %xmm4
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm6, %xmm26
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-BW-NEXT:    vmovdqa 448(%rdi), %xmm7
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm7, %xmm6
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm7, %xmm30
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-BW-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX512DQ-BW-NEXT:    vpmovqb %ymm4, %xmm4
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa 368(%rdi), %xmm2
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm2, %xmm31
-; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
-; AVX512DQ-BW-NEXT:    vmovaps 368(%rdi), %xmm2
-; AVX512DQ-BW-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 352(%rdi), %xmm27
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm27, %xmm6
-; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
 ; AVX512DQ-BW-NEXT:    vmovdqa 336(%rdi), %xmm2
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm11
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa 320(%rdi), %xmm2
 ; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm2, %xmm15
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, %xmm9
 ; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 352(%rdi), %xmm27
+; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm27, %xmm6
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, %xmm9
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
 ; AVX512DQ-BW-NEXT:    vpmovqb %zmm1, %xmm11
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm15
@@ -18451,19 +18451,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm28, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %xmm1, %xmm20
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm28, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %xmm1, %xmm20
 ; AVX512DQ-BW-FCP-NEXT:    vpsrlq $24, %zmm18, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vpmovqb %zmm14, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 9fd7862fdc368f..e7a6bfc48760e8 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -1163,13 +1163,9 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-LABEL: store_i16_stride3_vf32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm6
-; SSE-NEXT:    movdqa 32(%rdi), %xmm4
 ; SSE-NEXT:    movdqa 48(%rdi), %xmm0
-; SSE-NEXT:    movdqa 16(%rsi), %xmm7
-; SSE-NEXT:    movdqa 32(%rsi), %xmm8
 ; SSE-NEXT:    movdqa 48(%rsi), %xmm11
 ; SSE-NEXT:    movdqa 48(%rdx), %xmm12
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2]
 ; SSE-NEXT:    movdqa %xmm0, %xmm9
 ; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1183,11 +1179,13 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
 ; SSE-NEXT:    pand %xmm2, %xmm3
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2]
-; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    movdqa 32(%rdx), %xmm10
 ; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    por %xmm3, %xmm0
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 32(%rdi), %xmm0
+; SSE-NEXT:    movdqa 16(%rsi), %xmm7
+; SSE-NEXT:    movdqa 32(%rsi), %xmm8
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm5, %xmm3
 ; SSE-NEXT:    pandn %xmm1, %xmm3
@@ -1198,19 +1196,19 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5]
 ; SSE-NEXT:    pand %xmm5, %xmm9
 ; SSE-NEXT:    por %xmm3, %xmm9
-; SSE-NEXT:    movdqa %xmm4, %xmm0
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2]
-; SSE-NEXT:    pand %xmm5, %xmm1
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    movdqa %xmm5, %xmm4
 ; SSE-NEXT:    pandn %xmm3, %xmm4
+; SSE-NEXT:    movdqa 32(%rdx), %xmm3
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2]
+; SSE-NEXT:    pand %xmm5, %xmm1
 ; SSE-NEXT:    por %xmm1, %xmm4
 ; SSE-NEXT:    pand %xmm2, %xmm4
-; SSE-NEXT:    movdqa %xmm10, %xmm3
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2]
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,2,2]
 ; SSE-NEXT:    movdqa %xmm2, %xmm10
 ; SSE-NEXT:    pandn %xmm1, %xmm10
 ; SSE-NEXT:    por %xmm4, %xmm10
@@ -2829,7 +2827,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP-NEXT:    vmovdqa 80(%rsi), %xmm5
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm0
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
@@ -2850,6 +2847,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm13, %ymm7
 ; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm5, %ymm7, %ymm3
+; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm5
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 48(%rdi), %xmm7
 ; AVX2-FP-NEXT:    vmovdqa 48(%rsi), %xmm8
@@ -2860,8 +2858,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm8
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm15, %xmm15
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm15, %ymm7, %ymm7
-; AVX2-FP-NEXT:    vmovdqa %ymm0, %ymm5
-; AVX2-FP-NEXT:    vpermd %ymm0, %ymm10, %ymm15
+; AVX2-FP-NEXT:    vpermd %ymm5, %ymm10, %ymm15
 ; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm7, %ymm15, %ymm7
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm15
 ; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm0
@@ -2973,7 +2970,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovdqa 80(%rsi), %xmm5
 ; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
 ; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
@@ -2994,6 +2990,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm7
 ; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm5, %ymm7, %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 48(%rdi), %xmm7
 ; AVX2-FCP-NEXT:    vmovdqa 48(%rsi), %xmm8
@@ -3004,8 +3001,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm8
 ; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm15
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vmovdqa %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm10, %ymm15
+; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm10, %ymm15
 ; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm7, %ymm15, %ymm7
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm15
 ; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm0
@@ -3108,17 +3104,15 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm3
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm5
 ; AVX512-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm14
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
 ; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm11
@@ -3136,8 +3130,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
 ; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm12
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
+; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm12
 ; AVX512-NEXT:    vmovdqa 80(%rsi), %xmm13
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
@@ -3152,7 +3146,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX512-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512-NEXT:    vmovdqa %ymm6, %ymm2
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm5
@@ -3163,12 +3156,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa 112(%rsi), %xmm12
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512-NEXT:    vmovdqa64 16(%rsi), %xmm20
 ; AVX512-NEXT:    vprold $16, %xmm12, %xmm12
-; AVX512-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm5
@@ -3200,6 +3191,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpternlogq $248, %zmm15, %zmm0, %zmm5
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm6
 ; AVX512-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
 ; AVX512-NEXT:    vpor %ymm0, %ymm6, %ymm0
@@ -3218,20 +3212,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
 ; AVX512-NEXT:    vpternlogq $248, %zmm19, %zmm0, %zmm6
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512-NEXT:    vprold $16, %xmm24, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
+; AVX512-NEXT:    vprold $16, %xmm2, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
+; AVX512-NEXT:    vmovdqa 16(%rsi), %xmm4
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm1
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512-NEXT:    vprold $16, %xmm20, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
+; AVX512-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
@@ -3260,17 +3253,15 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm5
 ; AVX512-FCP-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm14
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
 ; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm11
@@ -3288,8 +3279,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm12
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm12
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rsi), %xmm13
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
@@ -3304,7 +3295,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm2
 ; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm5
@@ -3315,12 +3305,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa 112(%rsi), %xmm12
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm20
 ; AVX512-FCP-NEXT:    vprold $16, %xmm12, %xmm12
-; AVX512-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm5
@@ -3352,6 +3340,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpternlogq $248, %zmm15, %zmm0, %zmm5
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm6
 ; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm6, %ymm0
@@ -3370,20 +3361,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
 ; AVX512-FCP-NEXT:    vpternlogq $248, %zmm19, %zmm0, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512-FCP-NEXT:    vprold $16, %xmm24, %xmm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm2, %xmm0
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
+; AVX512-FCP-NEXT:    vmovdqa 16(%rsi), %xmm4
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm1
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vprold $16, %xmm20, %xmm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
+; AVX512-FCP-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
@@ -3412,17 +3402,15 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm3
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm5
 ; AVX512DQ-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm4
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm14
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
 ; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm11
@@ -3440,8 +3428,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm12
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
+; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm12
 ; AVX512DQ-NEXT:    vmovdqa 80(%rsi), %xmm13
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
@@ -3456,7 +3444,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512DQ-NEXT:    vmovdqa %ymm6, %ymm2
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm5
@@ -3467,12 +3454,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa 112(%rsi), %xmm12
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQ-NEXT:    vmovdqa64 16(%rsi), %xmm20
 ; AVX512DQ-NEXT:    vprold $16, %xmm12, %xmm12
-; AVX512DQ-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm5
@@ -3504,6 +3489,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpternlogq $248, %zmm15, %zmm0, %zmm5
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm6
 ; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm6, %ymm0
@@ -3522,20 +3510,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
 ; AVX512DQ-NEXT:    vpternlogq $248, %zmm19, %zmm0, %zmm6
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-NEXT:    vprold $16, %xmm24, %xmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
+; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
+; AVX512DQ-NEXT:    vmovdqa 16(%rsi), %xmm4
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm1
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vprold $16, %xmm20, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
+; AVX512DQ-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
@@ -3564,17 +3551,15 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm5
 ; AVX512DQ-FCP-NEXT:    vprold $16, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm14
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm3, %ymm11
@@ -3592,8 +3577,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm10
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm12
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rsi), %xmm13
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
@@ -3608,7 +3593,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm24
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm5
@@ -3619,12 +3603,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa 112(%rsi), %xmm12
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm20
 ; AVX512DQ-FCP-NEXT:    vprold $16, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm21
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm5
@@ -3656,6 +3638,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm15, %zmm0, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm6
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm6
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm6, %ymm0
@@ -3674,20 +3659,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm7
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
 ; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm19, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm24, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm2, %xmm0
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rsi), %xmm4
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm20, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index a52d6cc9bd3b71..51d0ba916f2af5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -3083,20 +3083,16 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm18
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512-NEXT:    vmovdqa 16(%rcx), %xmm5
 ; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512-NEXT:    vmovdqa 48(%rcx), %xmm9
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512-NEXT:    vmovdqa 16(%rdx), %xmm8
 ; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm14
-; AVX512-NEXT:    vmovdqa 48(%rdx), %xmm10
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19]
@@ -3161,13 +3157,11 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512-NEXT:    vinserti32x4 $1, %xmm2, %ymm17, %ymm2
-; AVX512-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm17
-; AVX512-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512-NEXT:    vmovdqa 80(%rcx), %xmm0
 ; AVX512-NEXT:    vmovdqa 80(%rdx), %xmm1
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3192,16 +3186,20 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512-NEXT:    vmovdqa 48(%rcx), %xmm9
+; AVX512-NEXT:    vmovdqa 48(%rdx), %xmm10
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm6, %ymm2
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512-NEXT:    vpermt2d %zmm2, %zmm5, %zmm6
 ; AVX512-NEXT:    vmovdqa32 %zmm6, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm7
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm9
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
@@ -3237,20 +3235,16 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm18
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa 16(%rcx), %xmm5
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512-FCP-NEXT:    vmovdqa 48(%rcx), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512-FCP-NEXT:    vmovdqa 16(%rdx), %xmm8
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm14
-; AVX512-FCP-NEXT:    vmovdqa 48(%rdx), %xmm10
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm5, %zmm8
@@ -3315,13 +3309,11 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm2, %ymm17, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rcx), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa 80(%rdx), %xmm1
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3346,16 +3338,20 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 48(%rcx), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa 48(%rdx), %xmm10
 ; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm6, %ymm2
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm6, %zmm2
 ; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm5, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm9
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
 ; AVX512-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
@@ -3391,20 +3387,16 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm18
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa 16(%rcx), %xmm5
 ; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512DQ-NEXT:    vmovdqa 48(%rcx), %xmm9
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512DQ-NEXT:    vmovdqa 16(%rdx), %xmm8
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm14
-; AVX512DQ-NEXT:    vmovdqa 48(%rdx), %xmm10
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19]
@@ -3469,13 +3461,11 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm2, %ymm17, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm17
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-NEXT:    vmovdqa 80(%rcx), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa 80(%rdx), %xmm1
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3500,16 +3490,20 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 48(%rcx), %xmm9
+; AVX512DQ-NEXT:    vmovdqa 48(%rdx), %xmm10
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm6, %ymm2
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm5, %zmm6
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm6, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm9
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
@@ -3545,20 +3539,16 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rcx), %xmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rcx), %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdx), %xmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdx), %xmm10
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm5, %zmm8
@@ -3623,13 +3613,11 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm2, %ymm17, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rcx), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdx), %xmm1
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3654,16 +3642,20 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rcx), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdx), %xmm10
 ; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm6, %ymm2
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm6, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm5, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm9
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
 ; AVX512DQ-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 994c785126d256..3b160cdef312b9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -1158,13 +1158,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-LABEL: store_i16_stride5_vf16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm5
-; SSE-NEXT:    movdqa (%rsi), %xmm8
-; SSE-NEXT:    movdqa (%rcx), %xmm14
 ; SSE-NEXT:    movdqa 16(%rsi), %xmm0
 ; SSE-NEXT:    movdqa 16(%rdx), %xmm10
-; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 16(%rcx), %xmm11
-; SSE-NEXT:    movdqa 16(%r8), %xmm3
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    pandn %xmm5, %xmm2
@@ -1178,7 +1174,6 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0]
 ; SSE-NEXT:    pand %xmm9, %xmm4
-; SSE-NEXT:    movdqa (%rdi), %xmm15
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    movdqa %xmm9, %xmm13
@@ -1186,6 +1181,11 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    por %xmm4, %xmm13
 ; SSE-NEXT:    pand %xmm12, %xmm13
 ; SSE-NEXT:    por %xmm6, %xmm13
+; SSE-NEXT:    movdqa (%rdi), %xmm15
+; SSE-NEXT:    movdqa (%rsi), %xmm8
+; SSE-NEXT:    movdqa (%rcx), %xmm14
+; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%r8), %xmm3
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
 ; SSE-NEXT:    pand %xmm2, %xmm13
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
@@ -2489,10 +2489,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5,6,6]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm6, %xmm11
+; SSE-NEXT:    movdqa %xmm1, %xmm12
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7]
-; SSE-NEXT:    movdqa %xmm1, %xmm12
 ; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
@@ -2837,10 +2837,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $72, %rsp
 ; AVX2-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX2-NEXT:    vmovdqa (%r8), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm8
 ; AVX2-NEXT:    vmovdqa (%rcx), %xmm7
 ; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm9
@@ -2851,19 +2847,15 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vmovdqa (%rsi), %xmm12
 ; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm10
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm11
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
-; AVX2-NEXT:    vpblendvb %ymm14, %ymm5, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm13
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm14, %ymm11, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX2-NEXT:    vpbroadcastq (%r8), %ymm11
+; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
 ; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm11, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
@@ -2873,13 +2865,24 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
+; AVX2-NEXT:    vpblendvb %ymm14, %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
+; AVX2-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX2-NEXT:    vmovdqa (%r8), %ymm1
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpbroadcastq 32(%r8), %ymm5
 ; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm5, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX2-NEXT:    vpshufb %xmm11, %xmm12, %xmm0
 ; AVX2-NEXT:    vpbroadcastq 8(%rdi), %xmm12
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %xmm14, %xmm7, %xmm7
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7]
@@ -2898,15 +2901,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vpbroadcastq 40(%rdi), %xmm11
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7]
 ; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm10
-; AVX2-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
 ; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm11
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
 ; AVX2-NEXT:    vmovdqa 32(%r8), %ymm6
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1]
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm5
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm13, %ymm14, %ymm9, %ymm9
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm10[3,2,3,3,7,6,7,7]
@@ -2918,10 +2916,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vpblendvb %ymm14, %ymm0, %ymm13, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1]
 ; AVX2-NEXT:    vpblendvb %ymm12, %ymm9, %ymm13, %ymm0
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm9
+; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vmovdqa %ymm5, %ymm9
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm9[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
 ; AVX2-NEXT:    vmovdqa (%rcx), %ymm13
@@ -3622,7 +3621,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm5
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm22
 ; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15]
@@ -3641,8 +3639,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
 ; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
@@ -3936,7 +3934,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm5
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm22
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15]
@@ -3955,8 +3952,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
@@ -4583,13 +4580,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pandn %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm12, %xmm11
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
+; SSE-NEXT:    movdqa %xmm14, %xmm12
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
 ; SSE-NEXT:    pand %xmm9, %xmm0
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm14, %xmm12
 ; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm3, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm3, %xmm7
 ; SSE-NEXT:    movdqa %xmm3, %xmm14
@@ -4978,10 +4975,10 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; SSE-NEXT:    movdqa %xmm7, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm13
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm13
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index b2a270adbb359e..c42bb48d39f157 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -1712,23 +1712,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $24, %rsp
 ; AVX2-FP-NEXT:    vmovaps (%r9), %ymm3
-; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm10
-; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm8
 ; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm6
 ; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm6
 ; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm9
 ; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm11
 ; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm5
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm3
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1]
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX2-FP-NEXT:    vmovdqa %xmm11, %xmm4
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
@@ -1740,9 +1734,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
 ; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm9
 ; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %ymm0, %ymm9
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm10
+; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm8
+; AVX2-FP-NEXT:    vmovdqa %xmm11, %xmm4
 ; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm11
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3]
 ; AVX2-FP-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm10
@@ -2038,15 +2037,12 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm4
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm10
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
-; AVX512-FCP-NEXT:    vmovaps {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3]
-; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %ymm16
 ; AVX512-FCP-NEXT:    vpermi2d %ymm9, %ymm11, %ymm12
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm11
@@ -2056,10 +2052,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
 ; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm13
 ; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm12
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm15
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm12
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
@@ -2068,7 +2068,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
 ; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm12, %zmm13
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm14, %zmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm8, %ymm6, %ymm7
@@ -2081,7 +2080,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm9
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm6
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm10
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11]
 ; AVX512-FCP-NEXT:    vpermi2d %zmm9, %zmm6, %zmm7
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm6
@@ -2095,7 +2093,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm8, %ymm7, %ymm9
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm10[1,2,2,3,5,6,6,7]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7]
 ; AVX512-FCP-NEXT:    vpermi2d %ymm7, %ymm9, %ymm8
@@ -2205,15 +2203,12 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm10
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
-; AVX512DQ-FCP-NEXT:    vmovaps {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %ymm16
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm9, %ymm11, %ymm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm11
@@ -2223,10 +2218,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm13
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm12
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm15
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
@@ -2235,7 +2234,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm12, %zmm13
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm14, %zmm0
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm8, %ymm6, %ymm7
@@ -2248,7 +2246,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm9
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm6
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm10
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %zmm9, %zmm6, %zmm7
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm6
@@ -2262,7 +2259,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm8, %ymm7, %ymm9
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm10[1,2,2,3,5,6,6,7]
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7]
 ; AVX512DQ-FCP-NEXT:    vpermi2d %ymm7, %ymm9, %ymm8
@@ -2419,21 +2416,20 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa 16(%rcx), %xmm10
 ; SSE-NEXT:    movdqa (%r8), %xmm8
 ; SSE-NEXT:    movdqa %xmm5, %xmm0
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; SSE-NEXT:    movdqa %xmm2, %xmm9
-; SSE-NEXT:    movdqa (%r9), %xmm11
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3]
 ; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[3,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm8[2,1,3,3,4,5,6,7]
 ; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,2],xmm7[0,1]
+; SSE-NEXT:    movdqa (%r9), %xmm7
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,0,1,3]
 ; SSE-NEXT:    movaps {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,0]
 ; SSE-NEXT:    andps %xmm6, %xmm9
-; SSE-NEXT:    movdqa %xmm11, %xmm7
-; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm7[0,2,2,3,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
 ; SSE-NEXT:    movaps %xmm6, %xmm0
 ; SSE-NEXT:    andnps %xmm11, %xmm0
@@ -5189,21 +5185,20 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa 16(%rcx), %xmm3
 ; SSE-NEXT:    movdqa (%r8), %xmm9
 ; SSE-NEXT:    movdqa %xmm12, %xmm0
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
 ; SSE-NEXT:    movdqa %xmm10, %xmm7
-; SSE-NEXT:    movdqa (%r9), %xmm8
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
 ; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm9[2,1,3,3,4,5,6,7]
 ; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,2],xmm5[0,1]
+; SSE-NEXT:    movdqa (%r9), %xmm5
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0,1,3]
 ; SSE-NEXT:    movaps {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,0]
 ; SSE-NEXT:    andps %xmm14, %xmm7
-; SSE-NEXT:    movdqa %xmm8, %xmm5
-; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
 ; SSE-NEXT:    movaps %xmm14, %xmm0
 ; SSE-NEXT:    andnps %xmm8, %xmm0
@@ -6623,13 +6618,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-NEXT:    vmovdqa (%r8), %xmm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT:    vmovdqa (%r8), %xmm1
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%r8), %xmm4
-; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm8
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
 ; AVX2-NEXT:    vmovdqa (%r9), %xmm0
@@ -6641,6 +6633,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm8
+; AVX2-NEXT:    vmovdqa 32(%r8), %xmm4
+; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
@@ -9245,27 +9240,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm6
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
 ; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm7
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm5
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm8
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm29
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm30
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1]
@@ -9276,18 +9266,23 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
 ; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm31
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm6
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm29
+; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm31
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm7
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 1d1c4de793b6d6..0dc07bf4e39da6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -3565,34 +3565,34 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa 48(%rsi), %xmm2
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rdx), %xmm1
-; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 48(%rcx), %xmm5
 ; SSE-NEXT:    movdqa 48(%r8), %xmm9
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
-; SSE-NEXT:    movaps 48(%rax), %xmm7
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm1, %xmm10
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm6, %xmm1
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    movdqa %xmm5, %xmm11
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    pand %xmm6, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 48(%r9), %xmm4
 ; SSE-NEXT:    movdqa %xmm3, %xmm5
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
 ; SSE-NEXT:    pand %xmm3, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pandn %xmm2, %xmm3
 ; SSE-NEXT:    por %xmm0, %xmm3
+; SSE-NEXT:    movdqa 48(%r9), %xmm4
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 48(%rax), %xmm7
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm4, %xmm0
 ; SSE-NEXT:    psrld $16, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -4799,27 +4799,27 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm9
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
-; AVX2-NEXT:    vmovdqa 32(%r9), %ymm7
 ; AVX2-NEXT:    vpermd %ymm8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2-NEXT:    vmovdqa 32(%r8), %ymm6
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,0,0,0,4,0,0]
 ; AVX2-NEXT:    vpermd %ymm5, %ymm2, %ymm4
+; AVX2-NEXT:    vpermd %ymm12, %ymm2, %ymm2
+; AVX2-NEXT:    vmovdqa 32(%r8), %ymm6
+; AVX2-NEXT:    vmovdqa 32(%r9), %ymm7
 ; AVX2-NEXT:    vmovdqa %ymm5, %ymm11
 ; AVX2-NEXT:    vpermd %ymm13, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[0,3,2,3,4,7,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm5, %ymm0
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
 ; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpermd %ymm12, %ymm2, %ymm2
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
 ; AVX2-NEXT:    vpblendvb %ymm5, %ymm2, %ymm3, %ymm2
@@ -7721,46 +7721,47 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 112(%rdx), %xmm1
 ; SSE-NEXT:    movdqa 112(%rcx), %xmm6
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
 ; SSE-NEXT:    movdqa 112(%r8), %xmm4
-; SSE-NEXT:    movaps 112(%rax), %xmm7
-; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
 ; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm13, %xmm1
-; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pandn %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    pand %xmm13, %xmm0
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movdqa 96(%rcx), %xmm12
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
+; SSE-NEXT:    movdqa %xmm15, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE-NEXT:    movdqa 112(%r9), %xmm8
-; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
-; SSE-NEXT:    movdqa 96(%rdx), %xmm5
 ; SSE-NEXT:    pand %xmm3, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm4, %xmm9
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pandn %xmm2, %xmm3
 ; SSE-NEXT:    por %xmm0, %xmm3
+; SSE-NEXT:    movdqa 112(%r9), %xmm8
+; SSE-NEXT:    movdqa %xmm4, %xmm9
 ; SSE-NEXT:    movdqa %xmm8, %xmm0
 ; SSE-NEXT:    psrld $16, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm4
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm15[0,2]
+; SSE-NEXT:    movdqa 96(%rdx), %xmm5
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 112(%rax), %xmm7
+; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2]
 ; SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
 ; SSE-NEXT:    andps %xmm1, %xmm4
 ; SSE-NEXT:    andnps %xmm7, %xmm1
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    orps %xmm4, %xmm1
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 96(%rcx), %xmm12
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm12, %xmm1
 ; SSE-NEXT:    psrlq $48, %xmm1
 ; SSE-NEXT:    movdqa %xmm5, %xmm3
@@ -8134,15 +8135,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535]
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pandn %xmm3, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2]
 ; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535]
 ; SSE-NEXT:    movdqa %xmm0, %xmm4
 ; SSE-NEXT:    pandn %xmm3, %xmm4
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    pand %xmm0, %xmm3
-; SSE-NEXT:    movdqa %xmm6, %xmm0
 ; SSE-NEXT:    por %xmm4, %xmm3
+; SSE-NEXT:    movdqa %xmm6, %xmm0
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
 ; SSE-NEXT:    movdqa %xmm0, %xmm4
 ; SSE-NEXT:    movdqa %xmm0, %xmm6
@@ -8846,6 +8847,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT:    movdqa %xmm10, %xmm13
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[1,1]
 ; SSE-NEXT:    movaps %xmm5, %xmm2
 ; SSE-NEXT:    andnps %xmm1, %xmm2
@@ -8882,7 +8884,6 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE-NEXT:    movaps %xmm10, %xmm13
 ; SSE-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    psrld $16, %xmm1
@@ -9022,7 +9023,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pandn %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm13, %xmm7
+; SSE-NEXT:    movdqa %xmm13, %xmm7
 ; SSE-NEXT:    shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
 ; SSE-NEXT:    # xmm7 = xmm7[1],mem[0]
 ; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -15366,8 +15367,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm28, %zmm29
 ; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
 ; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
 ; AVX512BW-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
+; AVX512BW-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
 ; AVX512BW-NEXT:    kmovd %ecx, %k3
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm7 {%k3}
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
@@ -15592,8 +15593,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm28, %zmm29
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
 ; AVX512BW-FCP-NEXT:    vpermt2w %zmm20, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
 ; AVX512BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
 ; AVX512BW-FCP-NEXT:    kmovd %ecx, %k3
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm7 {%k3}
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
@@ -15818,8 +15819,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm28, %zmm29
 ; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
 ; AVX512DQ-BW-NEXT:    vpermt2w %zmm20, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
 ; AVX512DQ-BW-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
 ; AVX512DQ-BW-NEXT:    kmovd %ecx, %k3
 ; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm0, %zmm7 {%k3}
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
@@ -16044,8 +16045,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm28, %zmm29
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm20, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm29 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm7 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index 64f5761b31d64f..aa541711601b18 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -2261,11 +2261,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm20
-; AVX512-NEXT:    vmovdqa (%r8), %ymm9
-; AVX512-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512-NEXT:    vmovdqa (%r10), %ymm12
-; AVX512-NEXT:    vmovdqa (%rax), %ymm13
 ; AVX512-NEXT:    vmovdqa (%rax), %xmm0
 ; AVX512-NEXT:    vmovdqa (%r10), %xmm2
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -2274,10 +2269,15 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa (%r9), %xmm0
 ; AVX512-NEXT:    vmovdqa (%r8), %xmm2
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm23
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
 ; AVX512-NEXT:    vpermt2d %zmm1, %zmm16, %zmm14
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa (%r8), %ymm9
+; AVX512-NEXT:    vmovdqa (%r9), %ymm11
+; AVX512-NEXT:    vmovdqa (%r10), %ymm12
+; AVX512-NEXT:    vmovdqa (%rax), %ymm13
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512-NEXT:    vmovdqa (%rcx), %xmm6
 ; AVX512-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
@@ -2298,7 +2298,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa (%rdx), %ymm2
 ; AVX512-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
 ; AVX512-NEXT:    vpermt2d %zmm3, %zmm19, %zmm14
 ; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm14 {%k1}
@@ -2332,11 +2331,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm20
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm9
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm12
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm13
 ; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa (%r10), %xmm2
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -2345,10 +2339,15 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm23
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
 ; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm9
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm6
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
@@ -2369,7 +2368,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
 ; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm19, %zmm14
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm14 {%k1}
@@ -2403,11 +2401,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm20
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm9
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm12
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm13
 ; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm2
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -2416,10 +2409,15 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm2
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm23
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm16, %zmm14
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm9
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm11
+; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm12
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm13
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm6
 ; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
@@ -2440,7 +2438,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
 ; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm19, %zmm14
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm15, %zmm14 {%k1}
@@ -2474,11 +2471,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %xmm2
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -2487,10 +2479,15 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm7
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
@@ -2511,7 +2508,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm19, %zmm14
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm14 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
index 22508e2ccfc79e..5017bd19a3e9fd 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
@@ -1322,24 +1322,24 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ; AVX2-LABEL: store_i32_stride2_vf64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovaps (%rdi), %ymm1
-; AVX2-NEXT:    vmovaps 96(%rdi), %ymm9
-; AVX2-NEXT:    vmovaps 128(%rdi), %ymm8
+; AVX2-NEXT:    vmovaps (%rsi), %ymm2
+; AVX2-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX2-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3]
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX2-NEXT:    vmovaps 192(%rdi), %ymm3
-; AVX2-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-NEXT:    vmovaps 160(%rdi), %ymm6
+; AVX2-NEXT:    vmovaps 128(%rdi), %ymm8
 ; AVX2-NEXT:    vmovaps 32(%rdi), %ymm4
 ; AVX2-NEXT:    vmovaps 64(%rdi), %ymm7
+; AVX2-NEXT:    vmovaps 96(%rdi), %ymm9
 ; AVX2-NEXT:    vmovaps 192(%rsi), %ymm10
 ; AVX2-NEXT:    vmovaps 160(%rsi), %ymm11
 ; AVX2-NEXT:    vmovaps 128(%rsi), %ymm12
-; AVX2-NEXT:    vmovaps (%rsi), %ymm2
 ; AVX2-NEXT:    vmovaps 32(%rsi), %ymm5
 ; AVX2-NEXT:    vmovaps 64(%rsi), %ymm13
 ; AVX2-NEXT:    vmovaps 96(%rsi), %ymm14
-; AVX2-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1]
 ; AVX2-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
 ; AVX2-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5]
@@ -1366,8 +1366,7 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3]
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1]
 ; AVX2-NEXT:    vmovaps 224(%rsi), %ymm15
-; AVX2-NEXT:    vmovaps %ymm0, %ymm1
-; AVX2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7]
+; AVX2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
 ; AVX2-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3]
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1]
@@ -1394,24 +1393,24 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ; AVX2-FP-LABEL: store_i32_stride2_vf64:
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovaps (%rsi), %ymm2
+; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
+; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3]
+; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovaps 192(%rsi), %ymm10
 ; AVX2-FP-NEXT:    vmovaps 160(%rsi), %ymm11
 ; AVX2-FP-NEXT:    vmovaps 128(%rsi), %ymm12
-; AVX2-FP-NEXT:    vmovaps (%rsi), %ymm2
 ; AVX2-FP-NEXT:    vmovaps 32(%rsi), %ymm5
 ; AVX2-FP-NEXT:    vmovaps 64(%rsi), %ymm13
 ; AVX2-FP-NEXT:    vmovaps 96(%rsi), %ymm14
-; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
-; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1]
 ; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
 ; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5]
@@ -1438,8 +1437,7 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3]
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1]
 ; AVX2-FP-NEXT:    vmovaps 224(%rsi), %ymm15
-; AVX2-FP-NEXT:    vmovaps %ymm0, %ymm1
-; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7]
+; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
 ; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3]
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1]
@@ -1466,24 +1464,24 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ; AVX2-FCP-LABEL: store_i32_stride2_vf64:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovaps (%rsi), %ymm2
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3]
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovaps 192(%rsi), %ymm10
 ; AVX2-FCP-NEXT:    vmovaps 160(%rsi), %ymm11
 ; AVX2-FCP-NEXT:    vmovaps 128(%rsi), %ymm12
-; AVX2-FCP-NEXT:    vmovaps (%rsi), %ymm2
 ; AVX2-FCP-NEXT:    vmovaps 32(%rsi), %ymm5
 ; AVX2-FCP-NEXT:    vmovaps 64(%rsi), %ymm13
 ; AVX2-FCP-NEXT:    vmovaps 96(%rsi), %ymm14
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1]
 ; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5]
@@ -1510,8 +1508,7 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3]
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1]
 ; AVX2-FCP-NEXT:    vmovaps 224(%rsi), %ymm15
-; AVX2-FCP-NEXT:    vmovaps %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7]
+; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
 ; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3]
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index 7d636b2d8aa3b7..62706b8226ccf3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -715,18 +715,18 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movaps 32(%rsi), %xmm10
 ; SSE-NEXT:    movaps 48(%rsi), %xmm11
 ; SSE-NEXT:    movaps 48(%rdx), %xmm8
-; SSE-NEXT:    movaps 32(%rdx), %xmm3
 ; SSE-NEXT:    movaps %xmm5, %xmm12
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
 ; SSE-NEXT:    movaps %xmm5, %xmm13
 ; SSE-NEXT:    movaps %xmm5, %xmm6
-; SSE-NEXT:    movaps 16(%rdx), %xmm0
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm11[3,3]
 ; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1]
-; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2]
 ; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[1,1],xmm8[0,3]
+; SSE-NEXT:    movaps 16(%rdx), %xmm8
+; SSE-NEXT:    movaps 32(%rdx), %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2]
 ; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0]
 ; SSE-NEXT:    movaps %xmm4, %xmm13
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1]
@@ -745,10 +745,9 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movaps %xmm2, %xmm13
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3]
-; SSE-NEXT:    movaps %xmm0, %xmm8
-; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1]
+; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,2],xmm14[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[0,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[1,1],xmm8[0,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,0]
 ; SSE-NEXT:    movaps %xmm1, %xmm14
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index ede8586545e496..42a15b1d90e535 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -1335,8 +1335,6 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-LABEL: store_i32_stride5_vf16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovaps (%rsi), %xmm6
-; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
-; AVX-NEXT:    vmovaps 32(%rsi), %ymm3
 ; AVX-NEXT:    vmovaps 32(%rsi), %xmm8
 ; AVX-NEXT:    vmovaps (%rdi), %xmm10
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm9
@@ -1344,12 +1342,11 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = xmm10[0],xmm6[0],zero,zero
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
 ; AVX-NEXT:    vmovaps (%rdx), %xmm11
-; AVX-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0]
 ; AVX-NEXT:    vmovaps (%rcx), %xmm12
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0]
 ; AVX-NEXT:    vbroadcastss 4(%rdx), %xmm13
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm7, %ymm7
-; AVX-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6],ymm7[7]
 ; AVX-NEXT:    vinsertf128 $1, (%r8), %ymm5, %ymm5
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6,7]
@@ -1364,10 +1361,11 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm15 = xmm9[0],xmm8[0],zero,zero
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm15, %ymm5
 ; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6],ymm7[7]
-; AVX-NEXT:    vmovaps 32(%rcx), %ymm7
 ; AVX-NEXT:    vinsertf128 $1, 32(%r8), %ymm15, %ymm15
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7]
-; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
+; AVX-NEXT:    vmovaps (%rdi), %ymm5
+; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 32(%rsi), %ymm3
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX-NEXT:    vbroadcastss 4(%rcx), %xmm12
@@ -1377,6 +1375,8 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
 ; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = mem[0,1,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7]
+; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
+; AVX-NEXT:    vmovaps 32(%rcx), %ymm7
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 48(%rsi), %xmm10
 ; AVX-NEXT:    vmovaps 48(%rdi), %xmm11
@@ -1415,8 +1415,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm9[2],xmm13[3],xmm9[3]
 ; AVX-NEXT:    vmovaps 16(%rsi), %xmm13
 ; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm9, %ymm9
-; AVX-NEXT:    vmovaps %ymm0, %ymm5
-; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[3,3],ymm0[3,3],ymm15[7,7],ymm0[7,7]
+; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[3,3],ymm5[3,3],ymm15[7,7],ymm5[7,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7]
 ; AVX-NEXT:    vmovaps 16(%rdi), %xmm14
@@ -4775,10 +4774,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movaps (%rcx), %xmm5
 ; SSE-NEXT:    movaps 16(%rcx), %xmm9
 ; SSE-NEXT:    movaps (%r8), %xmm4
-; SSE-NEXT:    movaps 32(%r8), %xmm14
-; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps 16(%r8), %xmm11
-; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm5, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
@@ -4788,15 +4784,18 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm9, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3]
-; SSE-NEXT:    movaps 32(%rcx), %xmm12
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE-NEXT:    movaps 32(%rcx), %xmm12
+; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 32(%r8), %xmm14
+; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm12, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
@@ -5311,14 +5310,14 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    movaps 240(%rdi), %xmm5
-; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm5, %xmm6
+; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1]
-; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
 ; SSE-NEXT:    movaps %xmm5, %xmm0
 ; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
 ; SSE-NEXT:    movaps %xmm3, %xmm15
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index 31d7791d674a44..64aa6ddb853171 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -10091,46 +10091,34 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
-; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
 ; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
-; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm27
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
@@ -10147,21 +10135,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm12, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
 ; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm30, %zmm5
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
+; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm7, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm8, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
 ; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm11, %zmm26
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28
 ; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm12, %zmm28
 ; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm30, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10659,46 +10659,34 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
 ; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
-; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm27
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512DQ-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512DQ-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
@@ -10715,21 +10703,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm12, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm30, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
+; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm7, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm8, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm11, %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm12, %zmm28
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm30, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -11227,46 +11227,34 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
 ; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
-; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm27
+; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
@@ -11283,21 +11271,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm12, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm30, %zmm5
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
+; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm7, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm8, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm11, %zmm26
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm12, %zmm28
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm30, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -11795,46 +11795,34 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
-; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm27
+; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
@@ -11851,21 +11839,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm12, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm30, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
+; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm7, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm8, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm11, %zmm26
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm12, %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm30, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 275f36005f1ee0..af60bf28554b04 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -1780,43 +1780,42 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    subq $520, %rsp # imm = 0x208
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movdqa (%rsi), %xmm4
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps (%rdx), %xmm5
-; SSE-NEXT:    movaps (%rcx), %xmm8
-; SSE-NEXT:    movaps 16(%r8), %xmm14
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%rdx), %xmm9
+; SSE-NEXT:    movaps (%rcx), %xmm8
 ; SSE-NEXT:    movaps (%r8), %xmm15
-; SSE-NEXT:    movdqa (%r9), %xmm13
-; SSE-NEXT:    movaps 16(%rcx), %xmm2
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa (%rdi), %xmm10
-; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%r9), %xmm3
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa (%rax), %xmm11
-; SSE-NEXT:    movdqa 16(%rsi), %xmm6
 ; SSE-NEXT:    movaps %xmm15, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
 ; SSE-NEXT:    movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
+; SSE-NEXT:    movdqa (%rdi), %xmm10
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%rdx), %xmm9
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa (%r9), %xmm13
+; SSE-NEXT:    movdqa (%rax), %xmm11
 ; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm10, %xmm0
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE-NEXT:    movdqa %xmm4, %xmm12
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm6, %xmm9
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
+; SSE-NEXT:    movdqa 16(%rsi), %xmm9
+; SSE-NEXT:    movaps 16(%rcx), %xmm2
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movaps 16(%r8), %xmm14
+; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%r9), %xmm3
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm4, %xmm12
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm14, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
@@ -1828,8 +1827,8 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm4
 ; SSE-NEXT:    movdqa %xmm4, %xmm0
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rsi), %xmm1
@@ -1918,12 +1917,12 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm2, %xmm0
 ; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
 ; SSE-NEXT:    movdqa %xmm4, %xmm13
-; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3]
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3]
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm12, %xmm11
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
@@ -4016,39 +4015,40 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa (%rdi), %xmm8
 ; SSE-NEXT:    movdqa (%rsi), %xmm10
 ; SSE-NEXT:    movaps (%rdx), %xmm14
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps (%rcx), %xmm13
-; SSE-NEXT:    movdqa 16(%rdx), %xmm7
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 16(%rcx), %xmm9
 ; SSE-NEXT:    movaps (%r8), %xmm0
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%r9), %xmm15
-; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%r9), %xmm12
-; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%rsi), %xmm4
-; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa (%rax), %xmm2
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1]
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
 ; SSE-NEXT:    movaps %xmm14, %xmm3
 ; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
 ; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
+; SSE-NEXT:    movdqa %xmm8, %xmm0
+; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 16(%r8), %xmm11
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%rsi), %xmm3
+; SSE-NEXT:    movdqa 16(%rdx), %xmm7
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 16(%rcx), %xmm9
+; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movaps 16(%r8), %xmm11
+; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%r9), %xmm12
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm11, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
@@ -4060,15 +4060,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT:    movdqa 16(%rdi), %xmm15
 ; SSE-NEXT:    movdqa %xmm15, %xmm0
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rsi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 32(%rdx), %xmm5
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 32(%rdx), %xmm5
+; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm5, %xmm1
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE-NEXT:    movaps 32(%rcx), %xmm5
@@ -4119,10 +4119,10 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 64(%rsi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 64(%rdx), %xmm3
 ; SSE-NEXT:    movdqa %xmm1, %xmm12
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 64(%rdx), %xmm3
+; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm1
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE-NEXT:    movaps 64(%rcx), %xmm3
@@ -6797,8 +6797,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512-NEXT:    movw $14448, %cx # imm = 0x3870
+; AVX512-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512-NEXT:    kmovw %ecx, %k2
 ; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7000,8 +7000,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512-FCP-NEXT:    kmovw %ecx, %k2
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7203,8 +7203,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512DQ-NEXT:    movw $14448, %cx # imm = 0x3870
+; AVX512DQ-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512DQ-NEXT:    kmovw %ecx, %k2
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7406,8 +7406,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512DQ-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512DQ-FCP-NEXT:    kmovw %ecx, %k2
 ; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7609,8 +7609,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512BW-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512BW-NEXT:    movw $14448, %cx # imm = 0x3870
+; AVX512BW-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512BW-NEXT:    kmovd %ecx, %k2
 ; AVX512BW-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -7812,8 +7812,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512BW-FCP-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512BW-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512BW-FCP-NEXT:    kmovd %ecx, %k2
 ; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -8015,8 +8015,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-BW-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512DQ-BW-NEXT:    movw $14448, %cx # imm = 0x3870
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512DQ-BW-NEXT:    kmovd %ecx, %k2
 ; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -8218,8 +8218,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    movw $14448, %cx # imm = 0x3870
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm5
@@ -8381,43 +8381,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    subq $2760, %rsp # imm = 0xAC8
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movdqa (%rsi), %xmm4
-; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps (%rdx), %xmm2
-; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%rdx), %xmm7
-; SSE-NEXT:    movdqa 16(%rsi), %xmm3
+; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps (%rcx), %xmm13
-; SSE-NEXT:    movaps 16(%rcx), %xmm9
-; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa (%rdi), %xmm6
 ; SSE-NEXT:    movaps (%r8), %xmm0
-; SSE-NEXT:    movaps 16(%r8), %xmm10
-; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa (%r9), %xmm12
-; SSE-NEXT:    movdqa 16(%r9), %xmm8
-; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa (%rax), %xmm15
+; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1]
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
 ; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
+; SSE-NEXT:    movdqa (%rdi), %xmm6
+; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%rsi), %xmm3
+; SSE-NEXT:    movdqa 16(%rdx), %xmm7
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 16(%rcx), %xmm9
+; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa (%r9), %xmm12
+; SSE-NEXT:    movdqa (%rax), %xmm15
 ; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa %xmm6, %xmm0
 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movaps 16(%r8), %xmm10
+; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%r9), %xmm8
+; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm10, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
@@ -8434,10 +8434,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 32(%rsi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 32(%rdx), %xmm3
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 32(%rdx), %xmm3
+; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm1
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE-NEXT:    movaps 32(%rcx), %xmm3
@@ -8487,10 +8487,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 64(%rsi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 64(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 64(%rdx), %xmm4
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE-NEXT:    movaps 64(%rcx), %xmm4
@@ -8540,10 +8540,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 96(%rsi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 96(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 96(%rdx), %xmm4
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE-NEXT:    movaps 96(%rcx), %xmm10
@@ -8592,10 +8592,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 128(%rsi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 128(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 128(%rdx), %xmm4
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE-NEXT:    movaps 128(%rcx), %xmm4
@@ -8645,10 +8645,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 160(%rsi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 160(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 160(%rdx), %xmm4
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE-NEXT:    movaps 160(%rcx), %xmm4
@@ -8698,10 +8698,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa 192(%rsi), %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movaps 192(%rdx), %xmm4
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps 192(%rdx), %xmm4
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm4, %xmm1
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE-NEXT:    movaps 192(%rcx), %xmm4
@@ -9646,20 +9646,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-NEXT:    subq $3432, %rsp # imm = 0xD68
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    vmovaps 224(%rdx), %ymm0
-; AVX-NEXT:    vmovaps 224(%rsi), %ymm2
-; AVX-NEXT:    vmovaps 224(%r8), %ymm4
-; AVX-NEXT:    vmovaps 224(%rdi), %ymm1
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 224(%rcx), %ymm5
 ; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7]
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm5
+; AVX-NEXT:    vmovaps 224(%rsi), %ymm2
+; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 224(%r8), %ymm4
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 224(%rax), %ymm3
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7]
-; AVX-NEXT:    vmovaps %ymm1, %ymm5
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm5[1],ymm2[3],ymm5[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
index bac4ff8ce434d1..d01c8b91de510c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
@@ -871,24 +871,24 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm3, %xmm2
 ; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm8, %xmm4
-; SSE-NEXT:    movaps 16(%rax), %xmm7
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
 ; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
-; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm14, %xmm2
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1]
-; SSE-NEXT:    movaps %xmm11, %xmm4
 ; SSE-NEXT:    unpckhps {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3]
 ; SSE-NEXT:    movaps %xmm15, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
 ; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm9, %xmm1
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSE-NEXT:    movaps 16(%rax), %xmm7
+; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movaps %xmm11, %xmm4
 ; SSE-NEXT:    unpckhps {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3]
 ; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movaps %xmm8, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
index 75b76f891d46ca..5bde9571124a6b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
@@ -844,26 +844,26 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT:    movapd 64(%rdi), %xmm5
 ; SSE-NEXT:    movapd (%rdi), %xmm1
 ; SSE-NEXT:    movapd (%rsi), %xmm4
-; SSE-NEXT:    movapd 16(%rsi), %xmm7
-; SSE-NEXT:    movapd 64(%rsi), %xmm9
-; SSE-NEXT:    movapd 48(%rdi), %xmm6
+; SSE-NEXT:    movapd (%rdx), %xmm0
+; SSE-NEXT:    movapd %xmm1, %xmm8
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0]
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSE-NEXT:    movapd 16(%rdi), %xmm0
 ; SSE-NEXT:    movapd 32(%rdi), %xmm3
-; SSE-NEXT:    movapd 16(%rdi), %xmm2
+; SSE-NEXT:    movapd 48(%rdi), %xmm6
+; SSE-NEXT:    movapd 64(%rsi), %xmm9
+; SSE-NEXT:    movapd 16(%rsi), %xmm7
 ; SSE-NEXT:    movapd 32(%rsi), %xmm11
 ; SSE-NEXT:    movapd 48(%rsi), %xmm10
 ; SSE-NEXT:    movapd 64(%rdx), %xmm15
-; SSE-NEXT:    movapd (%rdx), %xmm0
 ; SSE-NEXT:    movapd 16(%rdx), %xmm12
 ; SSE-NEXT:    movapd 32(%rdx), %xmm13
 ; SSE-NEXT:    movapd 48(%rdx), %xmm14
-; SSE-NEXT:    movapd %xmm1, %xmm8
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0]
 ; SSE-NEXT:    movapd %xmm8, (%rsp) # 16-byte Spill
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
 ; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movapd %xmm2, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm2
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm7[0]
 ; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index d610029880f81a..f27c115ef4c5fc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -6746,15 +6746,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm14
 ; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512-NEXT:    vmovdqa64 384(%rcx), %zmm23
-; AVX512-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512-NEXT:    vmovdqa64 320(%rcx), %zmm3
-; AVX512-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512-NEXT:    vpermt2q %zmm4, %zmm9, %zmm13
@@ -6766,11 +6757,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm16
 ; AVX512-NEXT:    vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512-NEXT:    vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512-NEXT:    vmovdqa64 128(%rdx), %zmm11
+; AVX512-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512-NEXT:    vmovdqa64 384(%rcx), %zmm23
+; AVX512-NEXT:    vmovdqa64 320(%rcx), %zmm3
+; AVX512-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512-NEXT:    vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm12
@@ -7041,15 +7041,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm14
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm13
@@ -7061,11 +7052,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm16
 ; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
+; AVX512-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
@@ -7336,15 +7336,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm14
 ; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 384(%rcx), %zmm23
-; AVX512DQ-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512DQ-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-NEXT:    vmovdqa64 320(%rcx), %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512DQ-NEXT:    vpermt2q %zmm4, %zmm9, %zmm13
@@ -7356,11 +7347,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm16
 ; AVX512DQ-NEXT:    vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512DQ-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512DQ-NEXT:    vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 128(%rdx), %zmm11
+; AVX512DQ-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 384(%rcx), %zmm23
+; AVX512DQ-NEXT:    vmovdqa64 320(%rcx), %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512DQ-NEXT:    vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm12
@@ -7631,15 +7631,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm14
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm13
@@ -7651,11 +7642,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm16
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
@@ -7926,15 +7926,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm14
 ; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm13
@@ -7946,11 +7937,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm16
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm23
+; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm12
@@ -8221,15 +8221,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm13
@@ -8241,11 +8232,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm16
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
@@ -8516,15 +8516,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm14
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rcx), %zmm23
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rcx), %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm13
@@ -8536,11 +8527,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm16
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdx), %zmm11
+; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-BW-NEXT:    vmovdqa64 384(%rcx), %zmm23
+; AVX512DQ-BW-NEXT:    vmovdqa64 320(%rcx), %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm12
@@ -8811,15 +8811,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm13
@@ -8831,11 +8822,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 384(%rcx), %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rcx), %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm12
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index 78a8042b3535ef..4de10147d80e52 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -7395,34 +7395,34 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    subq $2264, %rsp # imm = 0x8D8
 ; AVX-NEXT:    vmovaps 96(%rdi), %ymm5
-; AVX-NEXT:    vmovaps 160(%rdi), %ymm4
 ; AVX-NEXT:    vmovaps 64(%rcx), %ymm1
-; AVX-NEXT:    vmovaps (%rcx), %ymm2
 ; AVX-NEXT:    vmovaps 128(%rcx), %ymm0
+; AVX-NEXT:    vmovaps (%rcx), %ymm2
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = mem[2,3,2,3]
 ; AVX-NEXT:    vmovaps 16(%rdx), %xmm6
-; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
-; AVX-NEXT:    vmovaps 80(%rdx), %xmm3
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 192(%rdi), %ymm14
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = mem[2,3,2,3]
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovaps 80(%rdx), %xmm3
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 96(%rcx), %xmm2
 ; AVX-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
 ; AVX-NEXT:    vmovaps 144(%rdx), %xmm2
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm14
+; AVX-NEXT:    vmovaps 160(%rdi), %ymm4
+; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index 651f851f9f6f96..4eea4707e120db 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -4443,6 +4443,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm28
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm26, %zmm28
+; AVX512DQ-FCP-NEXT:    movb $112, %sil
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
 ; AVX512DQ-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
@@ -4457,7 +4458,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm19
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm19
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    movb $112, %sil
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
@@ -4468,10 +4468,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm29
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm28, %zmm29
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm28, %zmm24
+; AVX512DQ-FCP-NEXT:    movb $96, %sil
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm27
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm21
-; AVX512DQ-FCP-NEXT:    movb $96, %sil
 ; AVX512DQ-FCP-NEXT:    vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
 ; AVX512DQ-FCP-NEXT:    kmovw %esi, %k3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm24 {%k3}
@@ -4481,6 +4481,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm29, %zmm22
+; AVX512DQ-FCP-NEXT:    movb $120, %sil
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm30
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm20, %zmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm26
@@ -4500,7 +4501,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm7, %zmm12, %zmm20
-; AVX512DQ-FCP-NEXT:    movb $120, %sil
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm9, %zmm20, %zmm30
@@ -5267,6 +5267,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm12, %zmm26, %zmm28
+; AVX512DQ-BW-FCP-NEXT:    movb $112, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm24
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
@@ -5281,7 +5282,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm0, %zmm25, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm22, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    movb $112, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
@@ -5292,10 +5292,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm29
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm28, %zmm29
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm28, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    movb $96, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    movb $96, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %esi, %k3
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, %zmm24 {%k3}
@@ -5305,6 +5305,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm29, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    movb $120, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm20, %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm26
@@ -5324,7 +5325,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm7, %zmm12, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    movb $120, %sil
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %zmm9, %zmm20, %zmm30
@@ -16933,57 +16933,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 (%rsi), %zmm9
-; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm17
 ; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 64(%rcx), %zmm18
-; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 (%r9), %zmm15
-; AVX512-NEXT:    kmovw %r10d, %k1
-; AVX512-NEXT:    movb $96, %r10b
-; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
-; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vmovdqa64 (%rax), %zmm5
-; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm3
-; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
-; AVX512-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vmovdqa64 64(%rax), %zmm6
-; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
-; AVX512-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
 ; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
 ; AVX512-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
-; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512-NEXT:    vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm3
+; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
+; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    movb $96, %r10b
+; AVX512-NEXT:    kmovw %r10d, %k1
+; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm0
 ; AVX512-NEXT:    vpermt2q %zmm9, %zmm11, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpermt2q %zmm4, %zmm31, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512-NEXT:    vmovdqa 64(%r9), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%r8), %ymm4
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    kmovw %r10d, %k2
 ; AVX512-NEXT:    movb $28, %r10b
-; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512-NEXT:    kmovw %r10d, %k2
 ; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
@@ -16999,18 +16984,32 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6]
 ; AVX512-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512-NEXT:    vmovdqa64 %zmm15, %zmm9
-; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpermt2q %zmm1, %zmm13, %zmm0
+; AVX512-NEXT:    vpermt2q %zmm15, %zmm25, %zmm1
+; AVX512-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
 ; AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vpermt2q %zmm5, %zmm2, %zmm0
-; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm15
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpermt2q %zmm9, %zmm25, %zmm1
 ; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7]
 ; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm17
+; AVX512-NEXT:    vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 64(%rcx), %zmm18
+; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 64(%rax), %zmm6
+; AVX512-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm19, %zmm1
 ; AVX512-NEXT:    vpermt2q %zmm16, %zmm11, %zmm1
@@ -17895,58 +17894,44 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm20
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm4
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm10
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm19
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
 ; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 (%r9), %zmm18
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
 ; AVX512-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    movb $96, %r10b
-; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm10
 ; AVX512-FCP-NEXT:    kmovw %r10d, %k1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
-; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm12
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
 ; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm9, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
 ; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm10
-; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm8
 ; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm7
-; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
 ; AVX512-FCP-NEXT:    movb $28, %r10b
 ; AVX512-FCP-NEXT:    kmovw %r10d, %k2
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm14
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
@@ -17964,18 +17949,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7]
 ; AVX512-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm15, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermt2q %zmm18, %zmm23, %zmm1
 ; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
 ; AVX512-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm20
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm19
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm10
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm7
+; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
 ; AVX512-FCP-NEXT:    vpermt2q %zmm20, %zmm9, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm4
 ; AVX512-FCP-NEXT:    vpermt2q %zmm19, %zmm3, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%r8), %zmm1
@@ -18854,55 +18854,39 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 (%rsi), %zmm9
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 64(%rsi), %zmm13
-; AVX512DQ-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 64(%rcx), %zmm15
-; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm6
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 (%r9), %zmm14
-; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    movb $96, %r10b
-; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    kmovw %r10d, %k1
-; AVX512DQ-NEXT:    vmovdqa64 64(%rax), %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm5
 ; AVX512DQ-NEXT:    vmovdqa64 (%rax), %zmm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm6
+; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    movb $96, %r10b
+; AVX512DQ-NEXT:    kmovw %r10d, %k1
+; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512DQ-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
-; AVX512DQ-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
 ; AVX512DQ-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512DQ-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512DQ-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm6
-; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm16
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQ-NEXT:    movb $28, %r10b
 ; AVX512DQ-NEXT:    kmovw %r10d, %k2
@@ -18915,21 +18899,37 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm12
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpermt2q %zmm1, %zmm17, %zmm0
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vpermt2q %zmm3, %zmm24, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpermt2q %zmm14, %zmm25, %zmm1
 ; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7]
 ; AVX512DQ-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT:    vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm16
+; AVX512DQ-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 64(%rsi), %zmm13
+; AVX512DQ-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm18
+; AVX512DQ-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 64(%rcx), %zmm15
+; AVX512DQ-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512DQ-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm5
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm6
+; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
@@ -19808,56 +19808,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-FCP-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r9), %zmm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    movb $96, %r10b
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
 ; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    movb $96, %r10b
 ; AVX512DQ-FCP-NEXT:    kmovw %r10d, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
-; AVX512DQ-FCP-NEXT:    vmovaps (%rdx), %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r9), %zmm22
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm8, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm12
 ; AVX512DQ-FCP-NEXT:    movb $28, %r10b
 ; AVX512DQ-FCP-NEXT:    kmovw %r10d, %k2
 ; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
@@ -19873,7 +19858,6 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm2
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7]
@@ -19885,7 +19869,23 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
 ; AVX512DQ-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm16, %zmm8, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
@@ -20761,57 +20761,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm17
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512BW-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512BW-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm15
-; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    movb $96, %r10b
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm3
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
-; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm6
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
-; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
 ; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
+; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    movb $96, %r10b
+; AVX512BW-NEXT:    kmovd %r10d, %k1
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm11, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm31, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512BW-NEXT:    vmovdqa 64(%r9), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 64(%r8), %ymm4
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512BW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    kmovd %r10d, %k2
 ; AVX512BW-NEXT:    movb $28, %r10b
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512BW-NEXT:    kmovd %r10d, %k2
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
@@ -20827,18 +20812,32 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6]
 ; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
 ; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm1
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7]
 ; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm17
+; AVX512BW-NEXT:    vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm18
+; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm6
+; AVX512BW-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512BW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm1
@@ -21723,58 +21722,44 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm20
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512BW-FCP-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm4
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm10
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm19
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
 ; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm18
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
 ; AVX512BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    movb $96, %r10b
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm10
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
-; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm8, %zmm9, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm2
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm11, %zmm3, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 64(%r9), %ymm10
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %ymm8
 ; AVX512BW-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa 64(%r8), %ymm7
-; AVX512BW-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
 ; AVX512BW-FCP-NEXT:    movb $28, %r10b
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm14
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
@@ -21792,18 +21777,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7]
 ; AVX512BW-FCP-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm4, %zmm15, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm18, %zmm23, %zmm1
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
 ; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa 64(%r9), %ymm10
+; AVX512BW-FCP-NEXT:    vmovdqa 64(%r8), %ymm7
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm20, %zmm9, %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm4
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm19, %zmm3, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm1
@@ -22682,55 +22682,39 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %zmm9
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rsi), %zmm13
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-BW-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm15
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm14
-; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    movb $96, %r10b
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rax), %zmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm5
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rax), %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm6
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    movb $96, %r10b
+; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512DQ-BW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa 64(%r8), %ymm6
-; AVX512DQ-BW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm16
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQ-BW-NEXT:    movb $28, %r10b
 ; AVX512DQ-BW-NEXT:    kmovd %r10d, %k2
@@ -22743,21 +22727,37 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm12
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm0
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm3, %zmm24, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm14, %zmm25, %zmm1
 ; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7]
 ; AVX512DQ-BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm16
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rsi), %zmm13
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm18
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm15
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa 64(%r9), %ymm5
+; AVX512DQ-BW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa 64(%r8), %ymm6
+; AVX512DQ-BW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
@@ -23636,56 +23636,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    movb $96, %r10b
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    movb $96, %r10b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovaps (%rdx), %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
-; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512DQ-BW-FCP-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm14, %zmm8, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%r9), %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    movb $28, %r10b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
@@ -23701,7 +23686,6 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7]
@@ -23713,7 +23697,23 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%r9), %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm16, %zmm8, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
index 5c005567db232c..b98e79aa888674 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
@@ -18633,12 +18633,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512-NEXT:    vmovdqa64 64(%r8), %zmm18
 ; AVX512-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512-NEXT:    vmovdqa64 64(%r10), %zmm31
 ; AVX512-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512-NEXT:    movb $-64, %r11b
 ; AVX512-NEXT:    kmovw %r11d, %k1
@@ -18674,20 +18672,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
-; AVX512-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
-; AVX512-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
-; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
-; AVX512-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -18715,8 +18711,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
 ; AVX512-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512-NEXT:    vmovdqa64 128(%r8), %zmm9
+; AVX512-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 128(%r10), %zmm7
+; AVX512-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -18725,9 +18728,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
-; AVX512-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm5
 ; AVX512-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
@@ -18738,9 +18738,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -18749,9 +18749,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
-; AVX512-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -18760,13 +18760,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
-; AVX512-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm4
@@ -18775,9 +18775,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
 ; AVX512-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm2
 ; AVX512-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
-; AVX512-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7]
 ; AVX512-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0
@@ -19580,12 +19580,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%r8), %zmm18
 ; AVX512-FCP-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512-FCP-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%r10), %zmm31
 ; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512-FCP-NEXT:    movb $-64, %r11b
 ; AVX512-FCP-NEXT:    kmovw %r11d, %k1
@@ -19621,20 +19619,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
-; AVX512-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -19662,8 +19658,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
 ; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 128(%r8), %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 128(%r10), %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -19672,9 +19675,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm5
 ; AVX512-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
@@ -19685,9 +19685,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
-; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -19696,9 +19696,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
-; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -19707,13 +19707,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
@@ -19722,9 +19722,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
 ; AVX512-FCP-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
-; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
@@ -20527,12 +20527,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512DQ-NEXT:    vmovdqa64 64(%r8), %zmm18
 ; AVX512DQ-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512DQ-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512DQ-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512DQ-NEXT:    vmovdqa64 64(%r10), %zmm31
 ; AVX512DQ-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512DQ-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512DQ-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-NEXT:    movb $-64, %r11b
 ; AVX512DQ-NEXT:    kmovw %r11d, %k1
@@ -20568,20 +20566,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
-; AVX512DQ-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
-; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
-; AVX512DQ-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512DQ-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -20609,8 +20605,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQ-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
 ; AVX512DQ-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 128(%r8), %zmm9
+; AVX512DQ-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 128(%r10), %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512DQ-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -20619,9 +20622,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
-; AVX512DQ-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm5
 ; AVX512DQ-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
@@ -20632,9 +20632,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512DQ-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -20643,9 +20643,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512DQ-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
-; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -20654,13 +20654,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
-; AVX512DQ-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm4
@@ -20669,9 +20669,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm2
 ; AVX512DQ-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
-; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7]
 ; AVX512DQ-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0
@@ -21474,12 +21474,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r8), %zmm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r10), %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-FCP-NEXT:    movb $-64, %r11b
 ; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k1
@@ -21515,20 +21513,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -21556,8 +21552,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r8), %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r10), %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -21566,9 +21569,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
@@ -21579,9 +21579,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
-; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -21590,9 +21590,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
-; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -21601,13 +21601,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
@@ -21616,9 +21616,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
@@ -22421,12 +22421,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm18
 ; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm31
 ; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512BW-NEXT:    movb $-64, %r11b
 ; AVX512BW-NEXT:    kmovd %r11d, %k1
@@ -22462,20 +22460,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
-; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -22503,8 +22499,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
 ; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -22513,9 +22516,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm5
 ; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
@@ -22526,9 +22526,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -22537,9 +22537,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -22548,13 +22548,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
@@ -22563,9 +22563,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
 ; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
@@ -23368,12 +23368,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm18
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r10), %zmm31
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512BW-FCP-NEXT:    movb $-64, %r11b
 ; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
@@ -23409,20 +23407,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
-; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -23450,8 +23446,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r10), %zmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -23460,9 +23463,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
-; AVX512BW-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm5
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
 ; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
@@ -23473,9 +23473,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
-; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -23484,9 +23484,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
-; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -23495,13 +23495,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
@@ -23510,9 +23510,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
 ; AVX512BW-FCP-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
-; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
@@ -24315,12 +24315,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r8), %zmm18
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r10), %zmm31
 ; AVX512DQ-BW-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-BW-NEXT:    movb $-64, %r11b
 ; AVX512DQ-BW-NEXT:    kmovd %r11d, %k1
@@ -24356,20 +24354,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
-; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -24397,8 +24393,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r8), %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-BW-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r10), %zmm7
+; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -24407,9 +24410,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
-; AVX512DQ-BW-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm5
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
 ; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
@@ -24420,9 +24420,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
-; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -24431,9 +24431,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
-; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -24442,13 +24442,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
@@ -24457,9 +24457,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm2
 ; AVX512DQ-BW-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
-; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQ-BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7]
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm0
@@ -25262,12 +25262,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm30
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r10), %zmm26
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r10), %zmm31
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rax), %zmm27
 ; AVX512DQ-BW-FCP-NEXT:    movb $-64, %r11b
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
@@ -25303,20 +25301,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm11
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r10), %zmm16
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
@@ -25344,8 +25340,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r9), %zmm22
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r10), %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -25354,9 +25357,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
@@ -25367,9 +25367,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -25378,9 +25378,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -25389,13 +25389,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm4
@@ -25404,9 +25404,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index c1e99368e9201a..c8f5ffbc13f2c1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -1659,7 +1659,6 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshufb %xmm11, %xmm14, %xmm15
 ; AVX-NEXT:    vpor %xmm2, %xmm15, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 16(%rdx), %xmm1
 ; AVX-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
 ; AVX-NEXT:    vpshufb %xmm11, %xmm13, %xmm15
 ; AVX-NEXT:    vpor %xmm6, %xmm15, %xmm0
@@ -1673,11 +1672,11 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpshufb %xmm11, %xmm15, %xmm10
 ; AVX-NEXT:    vpor %xmm7, %xmm10, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 16(%rdx), %xmm0
 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15]
 ; AVX-NEXT:    vpshufb %xmm10, %xmm7, %xmm6
-; AVX-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15]
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15]
 ; AVX-NEXT:    vpshufb %xmm10, %xmm7, %xmm5
 ; AVX-NEXT:    vmovdqa 32(%rdx), %xmm11
 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 5e87572af5dc14..c86f0db9306135 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -989,16 +989,14 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3]
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3]
-; SSE-NEXT:    movdqa %xmm6, %xmm5
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
+; SSE-NEXT:    movdqa %xmm6, %xmm5
 ; SSE-NEXT:    pandn %xmm3, %xmm5
 ; SSE-NEXT:    por %xmm1, %xmm5
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
 ; SSE-NEXT:    pand %xmm2, %xmm5
 ; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2]
-; SSE-NEXT:    movdqa (%r8), %xmm0
 ; SSE-NEXT:    movdqa %xmm10, %xmm12
-; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
 ; SSE-NEXT:    pand %xmm1, %xmm7
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1]
@@ -1012,10 +1010,11 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa %xmm2, %xmm10
 ; SSE-NEXT:    pandn %xmm11, %xmm10
 ; SSE-NEXT:    por %xmm5, %xmm10
+; SSE-NEXT:    movdqa (%r8), %xmm5
+; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
 ; SSE-NEXT:    pand %xmm7, %xmm10
-; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[1,1,2,2]
+; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm5[1,1,2,2]
 ; SSE-NEXT:    movdqa %xmm7, %xmm0
 ; SSE-NEXT:    pandn %xmm11, %xmm0
 ; SSE-NEXT:    por %xmm10, %xmm0
@@ -1679,8 +1678,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2]
 ; SSE-NEXT:    movdqa %xmm3, %xmm15
 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
 ; SSE-NEXT:    pand %xmm3, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7]
@@ -1690,7 +1689,6 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    por %xmm0, %xmm5
 ; SSE-NEXT:    movdqa %xmm8, %xmm0
 ; SSE-NEXT:    pandn %xmm5, %xmm0
-; SSE-NEXT:    movdqa (%rcx), %xmm11
 ; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
 ; SSE-NEXT:    pand %xmm10, %xmm0
@@ -1698,8 +1696,9 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa %xmm10, %xmm4
 ; SSE-NEXT:    pandn %xmm1, %xmm4
 ; SSE-NEXT:    por %xmm0, %xmm4
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7]
+; SSE-NEXT:    movdqa (%rcx), %xmm11
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    pand %xmm13, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index fbeecbc0a4ab29..3ea3a45156b96d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -1768,10 +1768,10 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pandn %xmm2, %xmm7
 ; SSE-NEXT:    por %xmm6, %xmm7
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE-NEXT:    pand %xmm2, %xmm7
 ; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0]
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm7
 ; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    pandn %xmm6, %xmm1
 ; SSE-NEXT:    por %xmm7, %xmm1
@@ -2232,18 +2232,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rax
 ; AVX2-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX2-NEXT:    vmovdqa (%rdx), %ymm0
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%r8), %ymm4
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-NEXT:    vmovdqa (%rcx), %xmm6
 ; AVX2-NEXT:    vpshufb %xmm7, %xmm6, %xmm5
 ; AVX2-NEXT:    vmovdqa (%rdx), %xmm8
 ; AVX2-NEXT:    vpshufb %xmm7, %xmm8, %xmm9
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
 ; AVX2-NEXT:    vmovdqa (%rsi), %xmm11
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
@@ -2260,12 +2255,16 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
 ; AVX2-NEXT:    vpblendvb %ymm14, %ymm9, %ymm12, %ymm9
 ; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm12
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX2-NEXT:    vmovdqa (%rdx), %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa (%r8), %ymm4
 ; AVX2-NEXT:    vpshufb %ymm7, %ymm0, %ymm7
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm12 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[4],ymm12[4],ymm7[5],ymm12[5],ymm7[6],ymm12[6],ymm7[7],ymm12[7],ymm7[16],ymm12[16],ymm7[17],ymm12[17],ymm7[18],ymm12[18],ymm7[19],ymm12[19],ymm7[20],ymm12[20],ymm7[21],ymm12[21],ymm7[22],ymm12[22],ymm7[23],ymm12[23]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-NEXT:    vpshufb %ymm7, %ymm3, %ymm15
-; AVX2-NEXT:    vmovdqa %ymm1, %ymm2
-; AVX2-NEXT:    vpshufb %ymm7, %ymm1, %ymm7
+; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm7
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[16],ymm15[16],ymm7[17],ymm15[17],ymm7[18],ymm15[18],ymm7[19],ymm15[19],ymm7[20],ymm15[20],ymm7[21],ymm15[21],ymm7[22],ymm15[22],ymm7[23],ymm15[23]
 ; AVX2-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
@@ -2294,7 +2293,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm14, %ymm15, %ymm14
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
 ; AVX2-NEXT:    vmovdqa %ymm3, %ymm12
 ; AVX2-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
@@ -2366,19 +2365,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $40, %rsp
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm1, %xmm7
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm1, %xmm8
 ; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm6
+; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
 ; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
@@ -2502,19 +2501,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $40, %rsp
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm7
 ; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm8
 ; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm6
+; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
 ; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
@@ -5453,22 +5452,22 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm22
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm0
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm22
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
 ; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm21
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512-NEXT:    vpshufb %ymm15, %ymm4, %ymm0
+; AVX512-NEXT:    vpshufb %ymm15, %ymm2, %ymm0
 ; AVX512-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31]
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm2[8],ymm11[9],ymm2[9],ymm11[10],ymm2[10],ymm11[11],ymm2[11],ymm11[12],ymm2[12],ymm11[13],ymm2[13],ymm11[14],ymm2[14],ymm11[15],ymm2[15],ymm11[24],ymm2[24],ymm11[25],ymm2[25],ymm11[26],ymm2[26],ymm11[27],ymm2[27],ymm11[28],ymm2[28],ymm11[29],ymm2[29],ymm11[30],ymm2[30],ymm11[31],ymm2[31]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
 ; AVX512-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
@@ -5982,22 +5981,22 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm22
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm22
 ; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm21
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm19
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm2[8],ymm11[9],ymm2[9],ymm11[10],ymm2[10],ymm11[11],ymm2[11],ymm11[12],ymm2[12],ymm11[13],ymm2[13],ymm11[14],ymm2[14],ymm11[15],ymm2[15],ymm11[24],ymm2[24],ymm11[25],ymm2[25],ymm11[26],ymm2[26],ymm11[27],ymm2[27],ymm11[28],ymm2[28],ymm11[29],ymm2[29],ymm11[30],ymm2[30],ymm11[31],ymm2[31]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
 ; AVX512DQ-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 14e5f65407942e..df7013c2378e70 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -2684,29 +2684,29 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa 16(%rsi), %xmm4
 ; SSE-NEXT:    movdqa 16(%rdx), %xmm3
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6]
-; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%r8), %xmm6
 ; SSE-NEXT:    movdqa %xmm1, %xmm15
-; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movdqa 16(%r9), %xmm5
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
-; SSE-NEXT:    pand %xmm10, %xmm0
-; SSE-NEXT:    movdqa 16(%rcx), %xmm7
 ; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
+; SSE-NEXT:    pand %xmm10, %xmm0
 ; SSE-NEXT:    movdqa %xmm4, %xmm8
-; SSE-NEXT:    movdqa %xmm4, %xmm13
-; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,5,7]
-; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT:    movdqa %xmm10, %xmm2
 ; SSE-NEXT:    pandn %xmm1, %xmm2
 ; SSE-NEXT:    por %xmm0, %xmm2
 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm11
 ; SSE-NEXT:    pandn %xmm2, %xmm0
+; SSE-NEXT:    movdqa 16(%rcx), %xmm7
+; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa 16(%r8), %xmm6
+; SSE-NEXT:    movdqa 16(%r9), %xmm5
+; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm4, %xmm13
+; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm1, %xmm11
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6]
 ; SSE-NEXT:    movdqa %xmm3, %xmm4
 ; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4340,7 +4340,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm7, %zmm0
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5]
 ; AVX512-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 (%r10), %ymm17
 ; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero
@@ -4352,6 +4351,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %ymm11, %ymm12, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm12
 ; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
@@ -4369,7 +4369,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm8
 ; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm12
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,ymm12[13,u,u,u,u],zero,zero,ymm12[14,u,u,u,u],zero,zero,ymm12[15,u,u,u,u],zero,zero,ymm12[16,u,u,u,u],zero,zero,ymm12[17,u,u]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
 ; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
@@ -4619,7 +4618,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm7, %zmm0
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5]
 ; AVX512DQ-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r10), %ymm17
 ; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero
@@ -4631,6 +4629,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm11, %ymm12, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm12
 ; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
@@ -4648,7 +4647,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm8
 ; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm12
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,ymm12[13,u,u,u,u],zero,zero,ymm12[14,u,u,u,u],zero,zero,ymm12[15,u,u,u,u],zero,zero,ymm12[16,u,u,u,u],zero,zero,ymm12[17,u,u]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
 ; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
@@ -5329,10 +5327,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3]
-; SSE-NEXT:    pand %xmm6, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
 ; SSE-NEXT:    movdqa %xmm2, %xmm11
 ; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm6, %xmm0
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3]
 ; SSE-NEXT:    movdqa %xmm6, %xmm2
@@ -5928,11 +5926,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    pandn %xmm10, %xmm15
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[1,1,2,1]
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,7,5,6,4]
+; SSE-NEXT:    movdqa %xmm6, %xmm0
 ; SSE-NEXT:    movdqa %xmm6, %xmm2
 ; SSE-NEXT:    pandn %xmm10, %xmm2
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm8[1,1,2,2,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
-; SSE-NEXT:    pand %xmm6, %xmm10
+; SSE-NEXT:    pand %xmm0, %xmm10
 ; SSE-NEXT:    por %xmm10, %xmm2
 ; SSE-NEXT:    pand %xmm12, %xmm2
 ; SSE-NEXT:    por %xmm15, %xmm2
@@ -5941,9 +5940,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
 ; SSE-NEXT:    movdqa %xmm0, %xmm15
 ; SSE-NEXT:    pandn %xmm10, %xmm15
+; SSE-NEXT:    movdqa %xmm13, %xmm3
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0]
-; SSE-NEXT:    movdqa %xmm13, %xmm3
 ; SSE-NEXT:    pand %xmm0, %xmm10
 ; SSE-NEXT:    por %xmm10, %xmm15
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 39b012bcf8d4e4..a75b5f22d1605b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -3041,27 +3041,27 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm1, %ymm1
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm9
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm13, %ymm13
-; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm15
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5],ymm13[6],ymm1[7],ymm13[8,9,10,11,12],ymm1[13],ymm13[14],ymm1[15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm8
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm8
+; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm15
+; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
@@ -3079,8 +3079,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-NEXT:    vmovdqa %xmm9, %xmm5
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3]
diff --git a/llvm/test/CodeGen/X86/vector-intrinsics.ll b/llvm/test/CodeGen/X86/vector-intrinsics.ll
index 6441a83a4e326b..aa01a42b867612 100644
--- a/llvm/test/CodeGen/X86/vector-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-intrinsics.ll
@@ -239,15 +239,15 @@ define void @b(ptr %p, ptr %q) nounwind {
 ; CHECK-NEXT:    movaps 32(%rdi), %xmm0
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    movaps (%rsi), %xmm0
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    movaps 16(%rsi), %xmm0
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    movaps 48(%rdi), %xmm2
 ; CHECK-NEXT:    movaps 32(%rsi), %xmm0
 ; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movaps 48(%rdi), %xmm2
+; CHECK-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    movaps 48(%rsi), %xmm1
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    movaps %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index d6171235aa2c46..d8c9102ecc6592 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -686,12 +686,11 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind {
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm5[2,3]
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3]
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; X86-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; X86-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm3[0],ymm6[2],ymm3[2]
 ; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; X86-AVX1-NEXT:    vmovaps %ymm4, %ymm1
+; X86-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2]
+; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3]
 ; X86-AVX1-NEXT:    movl %ebp, %esp
 ; X86-AVX1-NEXT:    popl %ebp
 ; X86-AVX1-NEXT:    retl
@@ -717,9 +716,8 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind {
 ; X86-AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7]
 ; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; X86-AVX2-NEXT:    vmovaps %ymm5, %ymm0
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7]
 ; X86-AVX2-NEXT:    vmovaps %ymm4, %ymm1
 ; X86-AVX2-NEXT:    movl %ebp, %esp
 ; X86-AVX2-NEXT:    popl %ebp
@@ -740,13 +738,11 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind {
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3]
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; X64-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; X64-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; X64-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
 ; X64-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; X64-AVX1-NEXT:    vmovaps %ymm4, %ymm1
-; X64-AVX1-NEXT:    vmovaps %ymm5, %ymm3
+; X64-AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; X64-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: bit_reversal_permutation:
@@ -754,20 +750,19 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind {
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
 ; X64-AVX2-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; X64-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
 ; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3]
+; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
 ; X64-AVX2-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm3[0]
 ; X64-AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
+; X64-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
 ; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
 ; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; X64-AVX2-NEXT:    vmovaps %ymm6, %ymm0
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
 ; X64-AVX2-NEXT:    vmovaps %ymm4, %ymm1
 ; X64-AVX2-NEXT:    retq
   %v0 = shufflevector <16 x i64> %a0, <16 x i64> undef, <16 x i32> <i32 0, i32 1, i32 4, i32 5, i32 2, i32 3, i32 6, i32 7, i32 8, i32 9, i32 12, i32 13, i32 10, i32 11, i32 14, i32 15>
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 90b6beeae516de..f4c2eb5f213d9f 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -130,9 +130,8 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_32i8_to_32i16:
@@ -587,9 +586,8 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_16i16_to_16i32:
@@ -884,9 +882,8 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i32_to_8i64:
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 35b90a4b2205f2..f84131dfc87970 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -3220,8 +3220,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index bf330de825966a..7dc6956120d3a1 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1277,34 +1277,33 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX1-NEXT:    vmovdqu (%rdi), %xmm11
 ; AVX1-NEXT:    vmovups 64(%rdi), %xmm0
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm13
 ; AVX1-NEXT:    vmovdqu 96(%rdi), %xmm5
-; AVX1-NEXT:    vmovdqu 80(%rdi), %xmm4
-; AVX1-NEXT:    vmovdqu 144(%rdi), %xmm10
 ; AVX1-NEXT:    vmovdqu 112(%rdi), %xmm2
-; AVX1-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqu 144(%rdi), %xmm10
 ; AVX1-NEXT:    vmovdqu 160(%rdi), %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
-; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
 ; AVX1-NEXT:    vpshufb %xmm9, %xmm5, %xmm6
 ; AVX1-NEXT:    vpshufb %xmm9, %xmm10, %xmm7
-; AVX1-NEXT:    vpshufb %xmm9, %xmm11, %xmm8
-; AVX1-NEXT:    vpshufb %xmm9, %xmm13, %xmm9
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
 ; AVX1-NEXT:    vpshufb %xmm14, %xmm5, %xmm5
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
 ; AVX1-NEXT:    vpshufb %xmm15, %xmm2, %xmm12
-; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpor %xmm5, %xmm12, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
 ; AVX1-NEXT:    vpshufb %xmm15, %xmm3, %xmm12
 ; AVX1-NEXT:    vpor %xmm10, %xmm12, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm13
+; AVX1-NEXT:    vmovdqu 80(%rdi), %xmm4
+; AVX1-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpshufb %xmm9, %xmm11, %xmm8
+; AVX1-NEXT:    vpshufb %xmm9, %xmm13, %xmm9
+; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpshufb %xmm15, %xmm1, %xmm12
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpshufb %xmm15, %xmm0, %xmm12
 ; AVX1-NEXT:    vpor %xmm11, %xmm12, %xmm1
 ; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpshufb %xmm14, %xmm13, %xmm11
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 150385ffd8aa8c..a2bc5c6eeeb79a 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -210,14 +210,14 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %eax, %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %eax, %ebp
 ; WIN32-NEXT:    addl %eax, %ecx
 ; WIN32-NEXT:    addl %esi, %ecx
 ; WIN32-NEXT:    movl %edi, %eax
@@ -973,14 +973,14 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %eax, %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %eax, %ebx
 ; WIN32-NEXT:    addl %eax, %ecx
 ; WIN32-NEXT:    addl %esi, %ecx
 ; WIN32-NEXT:    movl %edi, %eax

>From 45e229df9fb26242e2ae0236f1c446ba2ea1bbfe Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Thu, 22 Aug 2024 21:44:40 +0200
Subject: [PATCH 05/15] Fix safety tests

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 750eaf2de48570..3c59d3fcadaecf 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -125,12 +125,12 @@ static std::optional<llvm::SmallVector<MachineInstr *>>
 moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
   MachineInstr *DstInstr = Dst->getInstr();
   MachineInstr *SrcInstr = Src->getInstr();
-  MachineBasicBlock *MBB = SrcInstr->getParent();
-
   if (DstInstr == nullptr || SrcInstr == nullptr)
     return {};
+
+  MachineBasicBlock *MBB = SrcInstr->getParent();
   assert("This function only operates on a basic block level." &&
-         MBB == SrcInstr->getParent());
+         MBB == DstInstr->getParent());
 
   int SectionSize =
       std::distance(SrcInstr->getIterator(), DstInstr->getIterator());

>From 793dd6b63534fa7dd0519d255dcc4b0fc8e92627 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Thu, 22 Aug 2024 22:07:02 +0200
Subject: [PATCH 06/15] Make sure that copy src precedes copy dst

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 3c59d3fcadaecf..8143398d550b55 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -132,7 +132,9 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
   assert("This function only operates on a basic block level." &&
          MBB == DstInstr->getParent());
 
-  int SectionSize =
+  assert(std::distance(SrcInstr->getIterator(), DstInstr->getIterator()) > 0 &&
+         "The copy source must precede the copy destination.");
+  unsigned SectionSize =
       std::distance(SrcInstr->getIterator(), DstInstr->getIterator());
 
   // The bit vector representing the instructions in the section.

>From 5420aa6005d7401188fa51e591dc20f6c138a2c3 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Thu, 22 Aug 2024 22:21:08 +0200
Subject: [PATCH 07/15] Keep using int for the section size. It is compared
 with possibly negative int and also good for the assertion

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 8143398d550b55..ccf7e9d39003bf 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -132,11 +132,13 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
   assert("This function only operates on a basic block level." &&
          MBB == DstInstr->getParent());
 
-  assert(std::distance(SrcInstr->getIterator(), DstInstr->getIterator()) > 0 &&
-         "The copy source must precede the copy destination.");
-  unsigned SectionSize =
+  
+  int SectionSize =
       std::distance(SrcInstr->getIterator(), DstInstr->getIterator());
 
+  assert(SectionSize > 0 &&
+         "The copy source must precede the copy destination.");
+
   // The bit vector representing the instructions in the section.
   // This vector stores which instruction needs to be moved and which does not.
   BitVector SectionInstr(SectionSize, false);

>From ca2e3a93a77f6fc572420b6ee3ea561a94b7dc87 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Thu, 22 Aug 2024 22:28:59 +0200
Subject: [PATCH 08/15] Capture the Queue instead of passing it as an argument

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index ccf7e9d39003bf..b53e4b12467c75 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -152,8 +152,7 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
   // (only if we are not talking about the destination node which is a special
   // case indicated by a flag) and is located between the source of the copy and
   // the destination of the copy.
-  auto ProcessSNodeChildren = [SrcInstr, &SectionSize, &SectionInstr](
-                                  std::queue<const SUnit *> &Queue,
+  auto ProcessSNodeChildren = [&Edges, SrcInstr, &SectionSize, &SectionInstr](
                                   const SUnit *Node, bool IsRoot) -> bool {
     for (llvm::SDep I : Node->Preds) {
       SUnit *SU = I.getSUnit();
@@ -171,7 +170,7 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
         // dependence. We do not need to do anything with it again.
         if (!SectionInstr[DestinationFromSource]) {
           SectionInstr[DestinationFromSource] = true;
-          Queue.push(SU);
+          Edges.push(SU);
         }
       }
     }
@@ -197,11 +196,11 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
   // source. To decide if we have it as dependency of another instruction, we
   // must check in the already traversed list if any of the instructions that is
   // depended on the source is contained. This would introduce extra costs.
-  ProcessSNodeChildren(Edges, Dst, true);
+  ProcessSNodeChildren(Dst, true);
   while (!Edges.empty()) {
     const auto *Current = Edges.front();
     Edges.pop();
-    if (!ProcessSNodeChildren(Edges, Current, false))
+    if (!ProcessSNodeChildren(Current, false))
       return {};
   }
 

>From b9908c44b74d7e2f077b4fbf51c75b627706b416 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Thu, 22 Aug 2024 23:18:50 +0200
Subject: [PATCH 09/15] One construction for the SchedulerDAG

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 47 ++++++++++-----------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index b53e4b12467c75..95160c31320e4f 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -66,6 +66,7 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -586,11 +587,11 @@ class MachineCopyPropagation : public MachineFunctionPass {
   void ReadRegister(MCRegister Reg, MachineInstr &Reader, DebugType DT);
   void readSuccessorLiveIns(const MachineBasicBlock &MBB);
   void ForwardCopyPropagateBlock(MachineBasicBlock &MBB);
-  void BackwardCopyPropagateBlock(MachineBasicBlock &MBB, bool ResolveAntiDeps = false);
+  void BackwardCopyPropagateBlock(MachineBasicBlock &MBB, ScheduleDAGMCP *DG = nullptr);
   void EliminateSpillageCopies(MachineBasicBlock &MBB);
   bool eraseIfRedundant(MachineInstr &Copy, MCRegister Src, MCRegister Def);
   void forwardUses(MachineInstr &MI);
-  void propagateDefs(MachineInstr &MI, ScheduleDAGMCP &DG, bool ResolveAntiDeps = false);
+  void propagateDefs(MachineInstr &MI, ScheduleDAGMCP *DG = nullptr);
   bool isForwardableRegClassCopy(const MachineInstr &Copy,
                                  const MachineInstr &UseI, unsigned UseIdx);
   bool isBackwardPropagatableRegClassCopy(const MachineInstr &Copy,
@@ -1158,9 +1159,8 @@ static bool isBackwardPropagatableCopy(const DestSourcePair &CopyOperands,
   return CopyOperands.Source->isRenamable() && CopyOperands.Source->isKill();
 }
 
-void MachineCopyPropagation::propagateDefs(
-    MachineInstr &MI, ScheduleDAGMCP &DG,
-    bool MoveDependenciesForBetterCopyPropagation) {
+void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
+                                           ScheduleDAGMCP *DG) {
   if (!Tracker.hasAnyCopies() && !Tracker.hasAnyInvalidCopies())
     return;
 
@@ -1186,7 +1186,7 @@ void MachineCopyPropagation::propagateDefs(
     MachineInstr *Copy = Tracker.findAvailBackwardCopy(
         MI, MODef.getReg().asMCReg(), *TRI, *TII, UseCopyInstr);
     if (!Copy) {
-      if (!MoveDependenciesForBetterCopyPropagation)
+      if (!DG)
         continue;
 
       LLVM_DEBUG(
@@ -1203,8 +1203,8 @@ void MachineCopyPropagation::propagateDefs(
       LLVM_DEBUG(
           dbgs()
           << "MCP: Found potential backward copy that has dependency.\n");
-      SUnit *DstSUnit = DG.getSUnit(Copy);
-      SUnit *SrcSUnit = DG.getSUnit(&MI);
+      SUnit *DstSUnit = DG->getSUnit(Copy);
+      SUnit *SrcSUnit = DG->getSUnit(&MI);
 
       InstructionsToMove =
           moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit);
@@ -1232,7 +1232,7 @@ void MachineCopyPropagation::propagateDefs(
     LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI)
                       << "\n     with " << printReg(Def, TRI) << "\n     in "
                       << MI << "     from " << *Copy);
-    if (!MoveDependenciesForBetterCopyPropagation) {
+    if (!DG) {
       MODef.setReg(Def);
       MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
 
@@ -1249,12 +1249,11 @@ void MachineCopyPropagation::propagateDefs(
 }
 
 void MachineCopyPropagation::BackwardCopyPropagateBlock(
-    MachineBasicBlock &MBB, bool MoveDependenciesForBetterCopyPropagation) {
-  ScheduleDAGMCP DG{*(MBB.getParent()), nullptr, false};
-  if (MoveDependenciesForBetterCopyPropagation) {
-    DG.startBlock(&MBB);
-    DG.enterRegion(&MBB, MBB.begin(), MBB.end(), MBB.size());
-    DG.buildSchedGraph(nullptr);
+    MachineBasicBlock &MBB, ScheduleDAGMCP *DG) {
+  if (DG) {
+    DG->startBlock(&MBB);
+    DG->enterRegion(&MBB, MBB.begin(), MBB.end(), MBB.size());
+    DG->buildSchedGraph(nullptr);
     // DG.viewGraph();
   }
  
@@ -1277,7 +1276,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
         // just let forward cp do COPY-to-COPY propagation.
         if (isBackwardPropagatableCopy(*CopyOperands, *MRI)) {
           Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII, UseCopyInstr,
-                                     MoveDependenciesForBetterCopyPropagation);
+                                     DG);
           Tracker.invalidateRegister(DefReg.asMCReg(), *TRI, *TII,
                                      UseCopyInstr);
           Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
@@ -1295,7 +1294,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
         Tracker.invalidateRegister(Reg, *TRI, *TII, UseCopyInstr, false);
       }
 
-    propagateDefs(MI, DG, MoveDependenciesForBetterCopyPropagation);
+    propagateDefs(MI, DG);
     for (const MachineOperand &MO : MI.operands()) {
       if (!MO.isReg())
         continue;
@@ -1320,7 +1319,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
         } else {
           Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII,
                                      UseCopyInstr,
-                                     MoveDependenciesForBetterCopyPropagation);
+                                     DG);
         }
       }
     }
@@ -1338,13 +1337,13 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
     Copy->eraseFromParent();
     ++NumDeletes;
   }
-  if (MoveDependenciesForBetterCopyPropagation) {
-    DG.exitRegion();
-    DG.finishBlock();
+  if (DG) {
+    DG->exitRegion();
+    DG->finishBlock();
     // QUESTION: Does it makes sense to keep the kill flags here?
     //           On the other parts of this pass we juts throw out
     //           the kill flags.
-    DG.fixupKills(MBB);
+    DG->fixupKills(MBB);
   }
 
 
@@ -1699,7 +1698,7 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
   LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
-
+  ScheduleDAGMCP DG{MF, nullptr, false};
   for (MachineBasicBlock &MBB : MF) {
     if (isSpillageCopyElimEnabled)
       EliminateSpillageCopies(MBB);
@@ -1716,7 +1715,7 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
     // The renaming wouldn't happen instantly. There would be a data structure
     // that contained what register should be renamed to what. Then after the
     // backward propagation has concluded the renaming would happen.
-    BackwardCopyPropagateBlock(MBB, true);
+    BackwardCopyPropagateBlock(MBB, &DG);
     // Then we do the actual copy propagation.
     BackwardCopyPropagateBlock(MBB);
 

>From 5561de96db79a30fbe14bc58c46e45957c790124 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <Gabor.Spaits at hightec-rt.com>
Date: Tue, 27 Aug 2024 12:56:45 +0200
Subject: [PATCH 10/15] Use a limit for the search

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 95160c31320e4f..2780e569426f90 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -124,6 +124,7 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs {
 
 static std::optional<llvm::SmallVector<MachineInstr *>>
 moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
+  unsigned MaxNumberOfNodesToBeProcessed = 25;
   MachineInstr *DstInstr = Dst->getInstr();
   MachineInstr *SrcInstr = Src->getInstr();
   if (DstInstr == nullptr || SrcInstr == nullptr)
@@ -147,13 +148,15 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
   // The queue for the breadth first search.
   std::queue<const SUnit *> Edges;
 
+  unsigned NumProcessedNode = 0;
+
   // Process the children of a node.
   // Basically every node are checked before it is being put into the queue.
   // A node is enqueued if it has no dependencies on the source of the copy
   // (only if we are not talking about the destination node which is a special
   // case indicated by a flag) and is located between the source of the copy and
   // the destination of the copy.
-  auto ProcessSNodeChildren = [&Edges, SrcInstr, &SectionSize, &SectionInstr](
+  auto ProcessSNodeChildren = [&Edges, SrcInstr, &SectionSize, &SectionInstr, &NumProcessedNode, &MaxNumberOfNodesToBeProcessed](
                                   const SUnit *Node, bool IsRoot) -> bool {
     for (llvm::SDep I : Node->Preds) {
       SUnit *SU = I.getSUnit();
@@ -174,8 +177,9 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
           Edges.push(SU);
         }
       }
+      NumProcessedNode++;
     }
-    return true;
+    return NumProcessedNode < MaxNumberOfNodesToBeProcessed;      
   };
 
   // The BFS happens here.

>From a5a2766a85703c60cd2c062581f6d9b65b4883f2 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <Gabor.Spaits at hightec-rt.com>
Date: Tue, 27 Aug 2024 20:42:17 +0200
Subject: [PATCH 11/15] Decrease the node limit to 10

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 2780e569426f90..62452b9211e301 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -124,7 +124,7 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs {
 
 static std::optional<llvm::SmallVector<MachineInstr *>>
 moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
-  unsigned MaxNumberOfNodesToBeProcessed = 25;
+  unsigned MaxNumberOfNodesToBeProcessed = 10;
   MachineInstr *DstInstr = Dst->getInstr();
   MachineInstr *SrcInstr = Src->getInstr();
   if (DstInstr == nullptr || SrcInstr == nullptr)
@@ -175,9 +175,9 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
         if (!SectionInstr[DestinationFromSource]) {
           SectionInstr[DestinationFromSource] = true;
           Edges.push(SU);
+          NumProcessedNode++;
         }
       }
-      NumProcessedNode++;
     }
     return NumProcessedNode < MaxNumberOfNodesToBeProcessed;      
   };

>From 9d87620f210d832398a9cd8f039689d1007486b9 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <Gabor.Spaits at hightec-rt.com>
Date: Thu, 29 Aug 2024 18:59:39 +0200
Subject: [PATCH 12/15] Just build the graph for copy regions

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 31 +++++++++++----------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 62452b9211e301..fbe5e35aa4874e 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -123,20 +123,24 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs {
 };
 
 static std::optional<llvm::SmallVector<MachineInstr *>>
-moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
+moveInstructionsOutOfTheWayIfWeCan(MachineInstr *DstInstr, MachineInstr *SrcInstr, ScheduleDAGMCP &DG) {
+  SUnit *Dst;
+  //SUnit *Src;
+
+  MachineBasicBlock *MBB = SrcInstr->getParent();
+  int SectionSize =
+      std::distance(SrcInstr->getIterator(), DstInstr->getIterator());
+
+  DG.enterRegion(MBB, (SrcInstr->getIterator()), ++(DstInstr->getIterator()), SectionSize+1);
+  DG.buildSchedGraph(nullptr);
+  Dst = DG.getSUnit(DstInstr);
   unsigned MaxNumberOfNodesToBeProcessed = 10;
-  MachineInstr *DstInstr = Dst->getInstr();
-  MachineInstr *SrcInstr = Src->getInstr();
   if (DstInstr == nullptr || SrcInstr == nullptr)
     return {};
 
-  MachineBasicBlock *MBB = SrcInstr->getParent();
   assert("This function only operates on a basic block level." &&
          MBB == DstInstr->getParent());
 
-  
-  int SectionSize =
-      std::distance(SrcInstr->getIterator(), DstInstr->getIterator());
 
   assert(SectionSize > 0 &&
          "The copy source must precede the copy destination.");
@@ -205,10 +209,14 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
   while (!Edges.empty()) {
     const auto *Current = Edges.front();
     Edges.pop();
-    if (!ProcessSNodeChildren(Current, false))
+    if (!ProcessSNodeChildren(Current, false)) {
+      DG.exitRegion();
       return {};
+    }
   }
 
+  DG.exitRegion();
+
   // If all of the dependencies were deemed valid during the BFS then we
   // are moving them before the copy source here keeping their relative
   // order to each other.
@@ -1207,11 +1215,9 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
       LLVM_DEBUG(
           dbgs()
           << "MCP: Found potential backward copy that has dependency.\n");
-      SUnit *DstSUnit = DG->getSUnit(Copy);
-      SUnit *SrcSUnit = DG->getSUnit(&MI);
 
       InstructionsToMove =
-          moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit);
+          moveInstructionsOutOfTheWayIfWeCan(Copy, &MI, *DG);
       if (!InstructionsToMove)
         continue;
     }
@@ -1256,8 +1262,6 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
     MachineBasicBlock &MBB, ScheduleDAGMCP *DG) {
   if (DG) {
     DG->startBlock(&MBB);
-    DG->enterRegion(&MBB, MBB.begin(), MBB.end(), MBB.size());
-    DG->buildSchedGraph(nullptr);
     // DG.viewGraph();
   }
  
@@ -1342,7 +1346,6 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
     ++NumDeletes;
   }
   if (DG) {
-    DG->exitRegion();
     DG->finishBlock();
     // QUESTION: Does it makes sense to keep the kill flags here?
     //           On the other parts of this pass we juts throw out

>From a34e68d6390cf23a88315e3b3527748f7724639c Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Fri, 23 Aug 2024 12:48:32 +0200
Subject: [PATCH 13/15] Fix a typo

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index fbe5e35aa4874e..c1fe3f196469f0 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -201,7 +201,7 @@ moveInstructionsOutOfTheWayIfWeCan(MachineInstr *DstInstr, MachineInstr *SrcInst
   // this scenario this must be ignored. Let's say that we can not control what
   // nodes to process and we come across the copy source. How do I know what
   // node has that copy source as their dependency? We can check of which node
-  // is the copy source the dependency of. This list will alway contain the
+  // is the copy source the dependency of. This list will always contain the
   // source. To decide if we have it as dependency of another instruction, we
   // must check in the already traversed list if any of the instructions that is
   // depended on the source is contained. This would introduce extra costs.

>From f6ca9f674ac27cc31a67a47d2bcf13e4880a67b7 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <Gabor.Spaits at hightec-rt.com>
Date: Fri, 30 Aug 2024 13:53:04 +0200
Subject: [PATCH 14/15] Prot back to one stage

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp   | 62 +++++++------------
 .../CodeGen/AArch64/anti-dependencies-mcp.mir |  2 +-
 2 files changed, 23 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index c1fe3f196469f0..7618bfb83de920 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -945,7 +945,6 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
     MOUse.setIsUndef(CopySrc.isUndef());
 
     LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
-
     // Clear kill markers that may have been invalidated.
     for (MachineInstr &KMI :
          make_range(Copy->getIterator(), std::next(MI.getIterator())))
@@ -1238,32 +1237,31 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
 
     if (hasOverlappingMultipleDef(MI, MODef, Def))
       continue;
-
+    if (InstructionsToMove)
+      for (auto *I : *InstructionsToMove) {
+        MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator());
+      }
     LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI)
                       << "\n     with " << printReg(Def, TRI) << "\n     in "
                       << MI << "     from " << *Copy);
-    if (!DG) {
-      MODef.setReg(Def);
-      MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
 
-      LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
-      MaybeDeadCopies.insert(Copy);
-      Changed = true;
-      ++NumCopyBackwardPropagated;
-    } else if (InstructionsToMove) {
-      for (auto *I : *InstructionsToMove) {
-        MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator());
-      }
-    }
+    
+    MODef.setReg(Def);
+    MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
+
+    LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
+    MaybeDeadCopies.insert(Copy);
+    Changed = true;
+    ++NumCopyBackwardPropagated;
+     
+      
   }
 }
 
 void MachineCopyPropagation::BackwardCopyPropagateBlock(
     MachineBasicBlock &MBB, ScheduleDAGMCP *DG) {
-  if (DG) {
-    DG->startBlock(&MBB);
-    // DG.viewGraph();
-  }
+  DG->startBlock(&MBB);
+  // DG.viewGraph();
  
 
   LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName()
@@ -1345,13 +1343,12 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
     Copy->eraseFromParent();
     ++NumDeletes;
   }
-  if (DG) {
-    DG->finishBlock();
-    // QUESTION: Does it makes sense to keep the kill flags here?
-    //           On the other parts of this pass we juts throw out
-    //           the kill flags.
-    DG->fixupKills(MBB);
-  }
+  
+  DG->finishBlock();
+  // QUESTION: Does it makes sense to keep the kill flags here?
+  //           On the other parts of this pass we juts throw out
+  //           the kill flags.
+  DG->fixupKills(MBB);
 
 
   MaybeDeadCopies.clear();
@@ -1709,22 +1706,7 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF) {
     if (isSpillageCopyElimEnabled)
       EliminateSpillageCopies(MBB);
-
-    // BackwardCopyPropagateBlock happens in two stages.
-    // First we move those unnecessary dependencies out of the way
-    // that may block copy propagations.
-    //
-    // The reason for this two stage approach is that the ScheduleDAG can not
-    // handle register renaming.
-    // QUESTION: I think these two stages could be merged together, if I were to change
-    // the renaming mechanism.
-    //
-    // The renaming wouldn't happen instantly. There would be a data structure
-    // that contained what register should be renamed to what. Then after the
-    // backward propagation has concluded the renaming would happen.
     BackwardCopyPropagateBlock(MBB, &DG);
-    // Then we do the actual copy propagation.
-    BackwardCopyPropagateBlock(MBB);
 
     ForwardCopyPropagateBlock(MBB);
   }
diff --git a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
index c3a59990ccd255..abf668533fd011 100644
--- a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
+++ b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
@@ -177,9 +177,9 @@ body:             |
     ; CHECK-NEXT: $w1 = KILL killed renamable $w1, implicit $x1
     ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def $w0
     ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
-    ; CHECK-NEXT: renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
     ; CHECK-NEXT: renamable $w1 = COPY $w0
     ; CHECK-NEXT: $w0 = ADDWrr killed $w0, $w0
+    ; CHECK-NEXT: $w0 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
     ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
     ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def dead $w0
     ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp

>From 027d7761dbb0293451a2c00e32cd6dc5ce83252c Mon Sep 17 00:00:00 2001
From: Gabor Spaits <Gabor.Spaits at hightec-rt.com>
Date: Fri, 30 Aug 2024 17:28:39 +0200
Subject: [PATCH 15/15] Revert "Prot back to one stage"

This reverts commit f6ca9f674ac27cc31a67a47d2bcf13e4880a67b7.
---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp   | 62 ++++++++++++-------
 .../CodeGen/AArch64/anti-dependencies-mcp.mir |  2 +-
 2 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 7618bfb83de920..c1fe3f196469f0 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -945,6 +945,7 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
     MOUse.setIsUndef(CopySrc.isUndef());
 
     LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
+
     // Clear kill markers that may have been invalidated.
     for (MachineInstr &KMI :
          make_range(Copy->getIterator(), std::next(MI.getIterator())))
@@ -1237,31 +1238,32 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
 
     if (hasOverlappingMultipleDef(MI, MODef, Def))
       continue;
-    if (InstructionsToMove)
-      for (auto *I : *InstructionsToMove) {
-        MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator());
-      }
+
     LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI)
                       << "\n     with " << printReg(Def, TRI) << "\n     in "
                       << MI << "     from " << *Copy);
+    if (!DG) {
+      MODef.setReg(Def);
+      MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
 
-    
-    MODef.setReg(Def);
-    MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
-
-    LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
-    MaybeDeadCopies.insert(Copy);
-    Changed = true;
-    ++NumCopyBackwardPropagated;
-     
-      
+      LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
+      MaybeDeadCopies.insert(Copy);
+      Changed = true;
+      ++NumCopyBackwardPropagated;
+    } else if (InstructionsToMove) {
+      for (auto *I : *InstructionsToMove) {
+        MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator());
+      }
+    }
   }
 }
 
 void MachineCopyPropagation::BackwardCopyPropagateBlock(
     MachineBasicBlock &MBB, ScheduleDAGMCP *DG) {
-  DG->startBlock(&MBB);
-  // DG.viewGraph();
+  if (DG) {
+    DG->startBlock(&MBB);
+    // DG.viewGraph();
+  }
  
 
   LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName()
@@ -1343,12 +1345,13 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
     Copy->eraseFromParent();
     ++NumDeletes;
   }
-  
-  DG->finishBlock();
-  // QUESTION: Does it makes sense to keep the kill flags here?
-  //           On the other parts of this pass we juts throw out
-  //           the kill flags.
-  DG->fixupKills(MBB);
+  if (DG) {
+    DG->finishBlock();
+    // QUESTION: Does it makes sense to keep the kill flags here?
+    //           On the other parts of this pass we juts throw out
+    //           the kill flags.
+    DG->fixupKills(MBB);
+  }
 
 
   MaybeDeadCopies.clear();
@@ -1706,7 +1709,22 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF) {
     if (isSpillageCopyElimEnabled)
       EliminateSpillageCopies(MBB);
+
+    // BackwardCopyPropagateBlock happens in two stages.
+    // First we move those unnecessary dependencies out of the way
+    // that may block copy propagations.
+    //
+    // The reason for this two stage approach is that the ScheduleDAG can not
+    // handle register renaming.
+    // QUESTION: I think these two stages could be merged together, if I were to change
+    // the renaming mechanism.
+    //
+    // The renaming wouldn't happen instantly. There would be a data structure
+    // that contained what register should be renamed to what. Then after the
+    // backward propagation has concluded the renaming would happen.
     BackwardCopyPropagateBlock(MBB, &DG);
+    // Then we do the actual copy propagation.
+    BackwardCopyPropagateBlock(MBB);
 
     ForwardCopyPropagateBlock(MBB);
   }
diff --git a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
index abf668533fd011..c3a59990ccd255 100644
--- a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
+++ b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
@@ -177,9 +177,9 @@ body:             |
     ; CHECK-NEXT: $w1 = KILL killed renamable $w1, implicit $x1
     ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def $w0
     ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
     ; CHECK-NEXT: renamable $w1 = COPY $w0
     ; CHECK-NEXT: $w0 = ADDWrr killed $w0, $w0
-    ; CHECK-NEXT: $w0 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
     ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
     ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def dead $w0
     ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp



More information about the llvm-commits mailing list