[llvm] [X86] Eliminate redundant zero-extension instructions (PR #161401)

Francisco Geiman Thiesen via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 2 20:22:30 PDT 2025


https://github.com/FranciscoThiesen updated https://github.com/llvm/llvm-project/pull/161401

>From 622f767197a08e7a2fd307b7a4c9652540d25432 Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Tue, 30 Sep 2025 09:52:34 -0700
Subject: [PATCH 1/5] [X86] Eliminate redundant zero-extension instructions

This pass eliminates redundant MOVZX32rr8 instructions when the source
register is a sub-register of the destination and the destination's upper
bits are already known to be zero.

For example, in loops processing byte values:
```
  movzbl (%rdi), %ecx  ; ECX upper 24 bits are zero
  ...
  movzbl %cl, %ecx     ; Redundant! CL is part of ECX, upper bits already 0
```
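
In source terms, a byte-processing loop along the lines of the following sketch
produces that sequence at -O2. This is a loose reconstruction modeled on the
countholes test case added in this patch, not the exact code from the original
report:
```
// Hedged sketch: counts "holes" in a run of decimal digits. Compiled for
// x86-64, the hot loop loads each byte with movzbl and then (before this
// pass) re-zero-extends the same byte with a second movzbl before using it
// as a table index.
static const int pre_table[10] = {1, 0, 0, 0, 1, 0, 1, 0, 2, 1};

int countholes(const char *s) {
  unsigned char c = *s++;
  int tot = 0;
  while (c >= '0') {             // matches the `icmp ult i8 %c, 48` exit test
    tot += pre_table[c - '0'];   // the byte value is reused as an index
    c = *s++;                    // next byte, zero-extended by the load
  }
  return tot;
}
```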

The optimization:
- Runs post-register allocation in the X86 backend pipeline
- Scans backward through the basic block to verify that the upper bits are zero
- Handles cross-block cases by checking the definitions in predecessor blocks
- Eliminates an instruction only when it is provably safe to do so (no heuristics)

This pattern commonly occurs in loops that process byte values; eliminating
the zero-extension saves one instruction per loop iteration and reduces code
size by 3 bytes per occurrence.
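
For reference, a minimal C++ sketch of the structural check at the heart of
the pass (the actual implementation below additionally scans backwards through
the block, and through predecessors, to prove the destination's upper bits are
already zero before erasing anything; the helper name here is illustrative):
```
#include "X86InstrInfo.h"                     // X86::MOVZX32rr8
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;

// Sketch only: a MOVZX32rr8 is a removal candidate when its source is a
// sub-register of its destination (e.g. CL within ECX). Whether the
// destination's upper bits are already zero still has to be proven.
static bool isCandidateZeroExtend(const MachineInstr &MI,
                                  const TargetRegisterInfo &TRI) {
  if (MI.getOpcode() != X86::MOVZX32rr8)
    return false;
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  return TRI.isSubRegister(DstReg, SrcReg);
}
```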
---
 llvm/lib/Target/X86/CMakeLists.txt            |   1 +
 llvm/lib/Target/X86/X86.h                     |   4 +
 .../X86/X86EliminateRedundantZeroExtend.cpp   | 292 ++++++++++++++++++
 llvm/lib/Target/X86/X86TargetMachine.cpp      |   1 +
 .../CodeGen/X86/eliminate-redundant-zext.ll   |  63 ++++
 5 files changed, 361 insertions(+)
 create mode 100644 llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
 create mode 100644 llvm/test/CodeGen/X86/eliminate-redundant-zext.ll

diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index f9bd233cf8ecf..351ba623e2b6d 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -47,6 +47,7 @@ set(sources
   X86FixupVectorConstants.cpp
   X86AvoidStoreForwardingBlocks.cpp
   X86DynAllocaExpander.cpp
+  X86EliminateRedundantZeroExtend.cpp
   X86FixupSetCC.cpp
   X86FlagsCopyLowering.cpp
   X86FloatingPoint.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 6261fadf10a7a..cd59eb5c80149 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -127,6 +127,10 @@ FunctionPass *createX86CmovConverterPass();
 /// the upper portions of registers, and to save code size.
 FunctionPass *createX86FixupBWInsts();
 
+/// Return a Machine IR pass that eliminates redundant zero-extension
+/// instructions where the upper bits are already known to be zero.
+FunctionPass *createX86EliminateRedundantZeroExtend();
+
 /// Return a Machine IR pass that reassigns instruction chains from one domain
 /// to another, when profitable.
 FunctionPass *createX86DomainReassignmentPass();
diff --git a/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
new file mode 100644
index 0000000000000..72717b1c64794
--- /dev/null
+++ b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
@@ -0,0 +1,292 @@
+//===-- X86EliminateRedundantZeroExtend.cpp - Eliminate Redundant ZExt ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass eliminates redundant zero-extension instructions where the source
+/// register is a sub-register of the destination and the destination's upper
+/// bits are known to be zero.
+///
+/// For example:
+///   movzbl (%rdi), %ecx  ; ECX = zero-extend byte, upper 24 bits are zero
+///   ...
+///   movzbl %cl, %ecx     ; Redundant! CL is part of ECX, upper bits already 0
+///
+/// This pattern commonly occurs in loops processing byte values.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-eliminate-zext"
+#define PASS_NAME "X86 Eliminate Redundant Zero Extension"
+
+namespace {
+class EliminateRedundantZeroExtend : public MachineFunctionPass {
+public:
+  static char ID;
+  EliminateRedundantZeroExtend() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return PASS_NAME; }
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().setNoVRegs();
+  }
+
+private:
+  const X86InstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+
+  /// Check if the register's upper bits are known to be zero at this point.
+  /// This checks backward from MI to find the most recent definition of Reg.
+  bool hasZeroUpperBits(Register Reg, const MachineInstr &MI,
+                        const MachineBasicBlock &MBB) const;
+
+  /// Try to eliminate a redundant MOVZX instruction.
+  bool tryEliminateRedundantZeroExtend(MachineInstr &MI,
+                                       MachineBasicBlock &MBB) const;
+};
+
+char EliminateRedundantZeroExtend::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86EliminateRedundantZeroExtend() {
+  return new EliminateRedundantZeroExtend();
+}
+
+bool EliminateRedundantZeroExtend::hasZeroUpperBits(
+    Register Reg, const MachineInstr &MI, const MachineBasicBlock &MBB) const {
+  // Walk backward from MI to find the most recent definition of Reg
+  MachineBasicBlock::const_reverse_iterator I = ++MI.getReverseIterator();
+  MachineBasicBlock::const_reverse_iterator E = MBB.rend();
+  for (; I != E; ++I) {
+    const MachineInstr &Inst = *I;
+
+    // Check if this instruction defines Reg
+    for (const MachineOperand &MO : Inst.operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+
+      Register DefReg = MO.getReg();
+      if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg)) {
+        // Found a definition - check if it zeros upper bits
+        unsigned Opc = Inst.getOpcode();
+        switch (Opc) {
+        // These instructions zero-extend to 32 bits
+        case X86::MOVZX32rm8:
+        case X86::MOVZX32rr8:
+        case X86::MOVZX32rm16:
+        case X86::MOVZX32rr16:
+          return true;
+        // XOR with self zeros the register
+        case X86::XOR32rr:
+          if (Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg())
+            return true;
+          return false;
+        // MOV32r0 explicitly zeros
+        case X86::MOV32r0:
+          return true;
+        // ADD, SUB on 32-bit register (implicitly zero-extends to 64-bit)
+        case X86::ADD32rr:
+        case X86::ADD32ri:
+        case X86::ADD32rm:
+        case X86::SUB32rr:
+        case X86::SUB32ri:
+        case X86::SUB32rm:
+        case X86::LEA32r:
+          return true;
+        default:
+          // Any other definition might set upper bits, so not safe
+          return false;
+        }
+      }
+
+      // Check if this instruction modifies Reg (partial write or implicit use)
+      if (TRI->regsOverlap(DefReg, Reg)) {
+        // Partial register update - upper bits are unknown
+        return false;
+      }
+    }
+
+    // Check for implicit defs
+    for (const MachineOperand &MO : Inst.implicit_operands()) {
+      if (MO.isReg() && MO.isDef() && TRI->regsOverlap(MO.getReg(), Reg)) {
+        return false;
+      }
+    }
+  }
+
+  // Didn't find a definition in this block - check predecessors
+  // If all predecessors define Reg with zero upper bits, it's safe
+  if (MBB.pred_empty())
+    return false;
+
+  // Check all predecessor blocks
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    bool FoundZeroExtend = false;
+
+    // SAFETY CHECK: If the sub-register is live-in to the predecessor,
+    // we make the CONSERVATIVE assumption that the parent register was
+    // zero-extended in an earlier block.
+    //
+    // This is safe because:
+    // 1. After register allocation, if $cl is live-in but $ecx is not,
+    //    it means only the low 8 bits are meaningful
+    // 2. The register allocator ensures no other code modifies $ecx between
+    //    the zero-extension and this point (otherwise $ecx would be live)
+    // 3. Any write to $ch or upper bits would show as a def of $ecx, which
+    //    would be found in our backward scan below and handled correctly
+    //
+    // However, this is still conservative - we should verify the actual
+    // definition to be completely safe.
+    Register SubReg8 = TRI->getSubReg(Reg, X86::sub_8bit);
+    Register SubReg16 = TRI->getSubReg(Reg, X86::sub_16bit);
+    bool SubRegLiveIn = (SubReg8 && Pred->isLiveIn(SubReg8)) ||
+                        (SubReg16 && Pred->isLiveIn(SubReg16));
+
+    if (SubRegLiveIn) {
+      // Sub-register is live-in. We'll verify this is safe by checking
+      // that no instructions in this block modify the parent register
+      // before we reach the end (where control flows to our block).
+      // If we find any such modification, we'll conservatively bail out.
+      bool SafeToAssume = true;
+      for (const MachineInstr &Inst : *Pred) {
+        for (const MachineOperand &MO : Inst.operands()) {
+          if (MO.isReg() && MO.isDef()) {
+            Register DefReg = MO.getReg();
+            // Check if this modifies Reg or overlaps with it (partial write)
+            if ((DefReg == Reg || TRI->regsOverlap(DefReg, Reg)) &&
+                DefReg != SubReg8 && DefReg != SubReg16) {
+              // Found a write to the parent register or overlapping register
+              // that's not just the sub-register we expect
+              SafeToAssume = false;
+              break;
+            }
+          }
+        }
+        if (!SafeToAssume)
+          break;
+      }
+
+      if (SafeToAssume) {
+        FoundZeroExtend = true;
+        goto next_predecessor;
+      }
+    }
+
+    // Walk backward through predecessor to find last definition of Reg
+    for (const MachineInstr &Inst : llvm::reverse(*Pred)) {
+      // Check if this instruction defines Reg
+      for (const MachineOperand &MO : Inst.operands()) {
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+
+        Register DefReg = MO.getReg();
+        if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg)) {
+          // Found a definition - check if it zeros upper bits
+          unsigned Opc = Inst.getOpcode();
+          switch (Opc) {
+          case X86::MOVZX32rm8:
+          case X86::MOVZX32rr8:
+          case X86::MOVZX32rm16:
+          case X86::MOVZX32rr16:
+          case X86::MOV32r0:
+          case X86::ADD32rr:
+          case X86::ADD32ri:
+          case X86::ADD32rm:
+          case X86::SUB32rr:
+          case X86::SUB32ri:
+          case X86::SUB32rm:
+          case X86::LEA32r:
+            FoundZeroExtend = true;
+            break;
+          case X86::XOR32rr:
+            if (Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg())
+              FoundZeroExtend = true;
+            break;
+          default:
+            // Found a definition that doesn't zero upper bits
+            return false;
+          }
+          // Found the definition in this predecessor
+          goto next_predecessor;
+        }
+
+        // Check for partial register updates
+        if (TRI->regsOverlap(DefReg, Reg)) {
+          return false;
+        }
+      }
+    }
+
+  next_predecessor:
+    // If we didn't find a zero-extending definition in this predecessor, fail
+    if (!FoundZeroExtend)
+      return false;
+  }
+
+  // All predecessors have zero-extending definitions
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::tryEliminateRedundantZeroExtend(
+    MachineInstr &MI, MachineBasicBlock &MBB) const {
+  unsigned Opc = MI.getOpcode();
+
+  // Only handle MOVZX32rr8 for now (can extend to MOVZX32rr16 later)
+  if (Opc != X86::MOVZX32rr8)
+    return false;
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  // Check if source is a sub-register of destination
+  // e.g., CL is sub-register of ECX
+  if (!TRI->isSubRegister(DstReg, SrcReg))
+    return false;
+
+  // Check if destination's upper bits are already zero
+  if (!hasZeroUpperBits(DstReg, MI, MBB))
+    return false;
+
+  // The MOVZX is redundant! Since SrcReg is part of DstReg and DstReg's
+  // upper bits are already zero, this instruction does nothing.
+  LLVM_DEBUG(dbgs() << "Eliminating redundant zero-extend: " << MI);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    // Iterate through instructions - use a worklist to handle erasures
+    SmallVector<MachineInstr *, 4> ToErase;
+
+    for (MachineInstr &MI : MBB) {
+      if (tryEliminateRedundantZeroExtend(MI, MBB)) {
+        Changed = true;
+        // Note: MI is already erased in tryEliminateRedundantZeroExtend
+        break; // Restart iteration for this block
+      }
+    }
+  }
+
+  return Changed;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8dd6f3d97ccea..72835150e8277 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -558,6 +558,7 @@ void X86PassConfig::addPreEmitPass() {
 
   if (getOptLevel() != CodeGenOptLevel::None) {
     addPass(createX86FixupBWInsts());
+    addPass(createX86EliminateRedundantZeroExtend());
     addPass(createX86PadShortFunctions());
     addPass(createX86FixupLEAs());
     addPass(createX86FixupInstTuning());
diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
new file mode 100644
index 0000000000000..2c9e46e043187
--- /dev/null
+++ b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s
+
+; Test that redundant MOVZX instructions are eliminated when the source
+; register is a sub-register of the destination and the destination's upper
+; bits are already known to be zero.
+
+; This is the original countholes test case from GitHub issue that demonstrates
+; the redundant movzbl %cl, %ecx in the loop
+define i32 @countholes(ptr %s) {
+; CHECK-LABEL: countholes:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpb $48, %cl
+; CHECK-NEXT:    jb .LBB0_3
+; CHECK-NEXT:  # %bb.1: # %while.body.preheader
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    leaq pre_table(%rip), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %while.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addl $-48, %ecx
+; CHECK-NEXT:    addl (%rdx,%rcx,4), %eax
+; CHECK-NEXT:    movzbl (%rdi), %ecx
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    cmpb $47, %cl
+; CHECK-NEXT:    ja .LBB0_2
+; CHECK-NEXT:  .LBB0_3: # %cleanup
+; CHECK-NEXT:    retq
+entry:
+  %c.0 = load i8, ptr %s, align 1
+  %conv = zext i8 %c.0 to i32
+  %cmp = icmp ult i8 %c.0, 48
+  br i1 %cmp, label %cleanup, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %s.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %s, %while.body.preheader ]
+  %c.010 = phi i8 [ %c.1, %while.body ], [ %c.0, %while.body.preheader ]
+  %tot.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv3 = zext i8 %c.010 to i64
+  %sub = add nsw i64 %conv3, -48
+  %arrayidx = getelementptr inbounds [10 x i32], ptr @pre_table, i64 0, i64 %sub
+  %0 = load i32, ptr %arrayidx, align 4
+  %add = add i32 %0, %tot.09
+  %incdec.ptr = getelementptr inbounds i8, ptr %s.addr.011, i64 1
+  %c.1 = load i8, ptr %incdec.ptr, align 1
+  %cmp1 = icmp ult i8 %c.1, 48
+  br i1 %cmp1, label %cleanup.loopexit, label %while.body
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32 [ 0, %entry ], [ %add, %cleanup.loopexit ]
+  ret i32 %retval.0
+}
+
+@pre_table = internal constant [10 x i32] [i32 1, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 2, i32 1], align 4

>From c7b29d9daced40d7589b3c3682820b21d174b9a2 Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Thu, 2 Oct 2025 04:04:08 -0700
Subject: [PATCH 2/5] Fixing broken test on linux

---
 .../CodeGen/X86/eliminate-redundant-zext.ll   |    9 +-
 llvm/test/CodeGen/X86/opt-pipeline.ll         |    1 +
 llvm/test/CodeGen/X86/pr38539.ll              |    2 +-
 llvm/test/CodeGen/X86/vector-compress.ll      | 1025 ++++++++---------
 4 files changed, 508 insertions(+), 529 deletions(-)

diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
index 2c9e46e043187..4399841d49876 100644
--- a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
+++ b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -5,7 +5,7 @@
 ; register is a sub-register of the destination and the destination's upper
 ; bits are already known to be zero.
 
-; This is the original countholes test case from GitHub issue that demonstrates
+; This is the original countholes test case from GitHub issue #160710 that demonstrates
 ; the redundant movzbl %cl, %ecx in the loop
 define i32 @countholes(ptr %s) {
 ; CHECK-LABEL: countholes:
@@ -17,12 +17,11 @@ define i32 @countholes(ptr %s) {
 ; CHECK-NEXT:  # %bb.1: # %while.body.preheader
 ; CHECK-NEXT:    incq %rdi
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    leaq pre_table(%rip), %rdx
-; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:    .p2align 4{{$}}
 ; CHECK-NEXT:  .LBB0_2: # %while.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    addl $-48, %ecx
-; CHECK-NEXT:    addl (%rdx,%rcx,4), %eax
+; CHECK-NOT:     movzbl %cl, %ecx
+; CHECK:         addl {{.*}}, %eax
 ; CHECK-NEXT:    movzbl (%rdi), %ecx
 ; CHECK-NEXT:    incq %rdi
 ; CHECK-NEXT:    cmpb $47, %cl
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 81390e59d0d0a..01385fb63d6e1 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -202,6 +202,7 @@
 ; CHECK-NEXT:       X86 vzeroupper inserter
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 Byte/Word Instruction Fixup
+; CHECK-NEXT:       X86 Eliminate Redundant Zero Extension
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 Atom pad short functions
 ; CHECK-NEXT:       X86 LEA Fixup
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 412455384e937..b633c28a214b7 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -23,7 +23,7 @@ define void @f() nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl (%eax), %eax
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index ac932d51017ae..a9b637931fc9b 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -1094,25 +1094,26 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vpextrb $1, %xmm1, %r13d
-; AVX2-NEXT:    vmovd %xmm1, %esi
-; AVX2-NEXT:    movl %esi, %eax
+; AVX2-NEXT:    vpextrb $1, %xmm1, %r11d
+; AVX2-NEXT:    vmovd %xmm1, %eax
+; AVX2-NEXT:    movzbl %al, %edx
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    subb %r11b, %al
+; AVX2-NEXT:    vpextrb $2, %xmm1, %esi
+; AVX2-NEXT:    subb %sil, %al
+; AVX2-NEXT:    vpextrb $3, %xmm1, %r13d
 ; AVX2-NEXT:    subb %r13b, %al
-; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
-; AVX2-NEXT:    subb %dl, %al
-; AVX2-NEXT:    vpextrb $3, %xmm1, %ebp
-; AVX2-NEXT:    subb %bpl, %al
 ; AVX2-NEXT:    vpextrb $4, %xmm1, %r12d
 ; AVX2-NEXT:    subb %r12b, %al
 ; AVX2-NEXT:    vpextrb $5, %xmm1, %r15d
 ; AVX2-NEXT:    subb %r15b, %al
 ; AVX2-NEXT:    vpextrb $6, %xmm1, %r14d
 ; AVX2-NEXT:    subb %r14b, %al
-; AVX2-NEXT:    vpextrb $7, %xmm1, %ebx
+; AVX2-NEXT:    vpextrb $7, %xmm1, %ebp
+; AVX2-NEXT:    subb %bpl, %al
+; AVX2-NEXT:    vpextrb $8, %xmm1, %ebx
 ; AVX2-NEXT:    subb %bl, %al
-; AVX2-NEXT:    vpextrb $8, %xmm1, %r11d
-; AVX2-NEXT:    subb %r11b, %al
 ; AVX2-NEXT:    vpextrb $9, %xmm1, %r10d
 ; AVX2-NEXT:    subb %r10b, %al
 ; AVX2-NEXT:    vpextrb $10, %xmm1, %r9d
@@ -1122,94 +1123,108 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
 ; AVX2-NEXT:    vpextrb $12, %xmm1, %edi
 ; AVX2-NEXT:    subb %dil, %al
 ; AVX2-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX2-NEXT:    subb %cl, %al
 ; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX2-NEXT:    subb %cl, %al
 ; AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
+; AVX2-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX2-NEXT:    subb %cl, %al
 ; AVX2-NEXT:    movzbl %al, %eax
 ; AVX2-NEXT:    andl $15, %eax
 ; AVX2-NEXT:    movzbl -40(%rsp,%rax), %eax
 ; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX2-NEXT:    vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    vpextrb $1, %xmm0, -40(%rsp,%rsi)
-; AVX2-NEXT:    andl $1, %r13d
-; AVX2-NEXT:    addq %rsi, %r13
-; AVX2-NEXT:    vpextrb $2, %xmm0, -40(%rsp,%r13)
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %r13, %rdx
-; AVX2-NEXT:    vpextrb $3, %xmm0, -40(%rsp,%rdx)
-; AVX2-NEXT:    andl $1, %ebp
-; AVX2-NEXT:    addq %rdx, %rbp
-; AVX2-NEXT:    vpextrb $4, %xmm0, -40(%rsp,%rbp)
-; AVX2-NEXT:    andl $1, %r12d
-; AVX2-NEXT:    addq %rbp, %r12
-; AVX2-NEXT:    andl $1, %r15d
-; AVX2-NEXT:    addq %r12, %r15
-; AVX2-NEXT:    # kill: def $r12d killed $r12d killed $r12 def $r12
-; AVX2-NEXT:    andl $15, %r12d
-; AVX2-NEXT:    vpextrb $5, %xmm0, -40(%rsp,%r12)
-; AVX2-NEXT:    andl $1, %r14d
-; AVX2-NEXT:    addq %r15, %r14
-; AVX2-NEXT:    # kill: def $r15d killed $r15d killed $r15 def $r15
-; AVX2-NEXT:    andl $15, %r15d
-; AVX2-NEXT:    vpextrb $6, %xmm0, -40(%rsp,%r15)
-; AVX2-NEXT:    andl $1, %ebx
-; AVX2-NEXT:    addq %r14, %rbx
-; AVX2-NEXT:    # kill: def $r14d killed $r14d killed $r14 def $r14
-; AVX2-NEXT:    andl $15, %r14d
-; AVX2-NEXT:    vpextrb $7, %xmm0, -40(%rsp,%r14)
-; AVX2-NEXT:    andl $1, %r11d
-; AVX2-NEXT:    addq %rbx, %r11
-; AVX2-NEXT:    # kill: def $ebx killed $ebx killed $rbx def $rbx
-; AVX2-NEXT:    andl $15, %ebx
-; AVX2-NEXT:    vpextrb $8, %xmm0, -40(%rsp,%rbx)
-; AVX2-NEXT:    andl $1, %r10d
-; AVX2-NEXT:    addq %r11, %r10
-; AVX2-NEXT:    # kill: def $r11d killed $r11d killed $r11 def $r11
-; AVX2-NEXT:    andl $15, %r11d
-; AVX2-NEXT:    vpextrb $9, %xmm0, -40(%rsp,%r11)
-; AVX2-NEXT:    andl $1, %r9d
-; AVX2-NEXT:    addq %r10, %r9
-; AVX2-NEXT:    # kill: def $r10d killed $r10d killed $r10 def $r10
-; AVX2-NEXT:    andl $15, %r10d
-; AVX2-NEXT:    vpextrb $10, %xmm0, -40(%rsp,%r10)
-; AVX2-NEXT:    andl $1, %r8d
-; AVX2-NEXT:    addq %r9, %r8
-; AVX2-NEXT:    # kill: def $r9d killed $r9d killed $r9 def $r9
-; AVX2-NEXT:    andl $15, %r9d
-; AVX2-NEXT:    vpextrb $11, %xmm0, -40(%rsp,%r9)
-; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    addq %r8, %rdi
-; AVX2-NEXT:    # kill: def $r8d killed $r8d killed $r8 def $r8
-; AVX2-NEXT:    andl $15, %r8d
-; AVX2-NEXT:    vpextrb $12, %xmm0, -40(%rsp,%r8)
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    addq %rdi, %rsi
-; AVX2-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
-; AVX2-NEXT:    andl $15, %edi
-; AVX2-NEXT:    vpextrb $13, %xmm0, -40(%rsp,%rdi)
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    vpextrb $1, %xmm0, -40(%rsp,%rdx)
+; AVX2-NEXT:    movzbl %r11b, %eax
 ; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rsi, %rax
-; AVX2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
-; AVX2-NEXT:    andl $15, %esi
-; AVX2-NEXT:    vpextrb $14, %xmm0, -40(%rsp,%rsi)
+; AVX2-NEXT:    addq %rdx, %rax
+; AVX2-NEXT:    vpextrb $2, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT:    movzbl %sil, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    vpextrb $3, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT:    movzbl %r13b, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    vpextrb $4, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT:    movzbl %r12b, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    movzbl %r15b, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpextrb $5, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT:    movzbl %r14b, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpextrb $15, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT:    cmpq $15, %rcx
-; AVX2-NEXT:    movl $15, %eax
-; AVX2-NEXT:    cmovbq %rcx, %rax
-; AVX2-NEXT:    vpextrb $15, %xmm0, %ecx
-; AVX2-NEXT:    cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
-; AVX2-NEXT:    movb %cl, -40(%rsp,%rax)
+; AVX2-NEXT:    vpextrb $6, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT:    movzbl %bpl, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpextrb $7, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT:    movzbl %bl, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    andl $15, %eax
+; AVX2-NEXT:    vpextrb $8, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT:    movzbl %r10b, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpextrb $9, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT:    movzbl %r9b, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    andl $15, %eax
+; AVX2-NEXT:    vpextrb $10, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT:    movzbl %r8b, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpextrb $11, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT:    movzbl %dil, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    andl $15, %eax
+; AVX2-NEXT:    vpextrb $12, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpextrb $13, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    andl $15, %eax
+; AVX2-NEXT:    vpextrb $14, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpextrb $15, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT:    cmpq $15, %rax
+; AVX2-NEXT:    movl $15, %ecx
+; AVX2-NEXT:    cmovbq %rax, %rcx
+; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
+; AVX2-NEXT:    cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; AVX2-NEXT:    movb %al, -40(%rsp,%rcx)
 ; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r12
@@ -1790,137 +1805,140 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
 ; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
 ; AVX2-NEXT:    andq $-32, %rsp
-; AVX2-NEXT:    subq $96, %rsp
-; AVX2-NEXT:    movl %r9d, %r11d
-; AVX2-NEXT:    movl %r8d, %r10d
-; AVX2-NEXT:    movl %ecx, %r9d
-; AVX2-NEXT:    movl %edx, %r8d
-; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
+; AVX2-NEXT:    subq $128, %rsp
+; AVX2-NEXT:    # kill: def $r9d killed $r9d def $r9
+; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    # kill: def $r8d killed $r8d def $r8
+; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movl %ecx, %r13d
+; AVX2-NEXT:    movl %edx, %r15d
+; AVX2-NEXT:    movl %esi, %ebx
 ; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT:    movzbl 360(%rbp), %eax
-; AVX2-NEXT:    movzbl 352(%rbp), %ecx
+; AVX2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movl 360(%rbp), %eax
+; AVX2-NEXT:    movl 352(%rbp), %ecx
 ; AVX2-NEXT:    vmovd %ecx, %xmm4
 ; AVX2-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 368(%rbp), %eax
+; AVX2-NEXT:    movl 368(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 376(%rbp), %eax
+; AVX2-NEXT:    movl 376(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 384(%rbp), %eax
+; AVX2-NEXT:    movl 384(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 392(%rbp), %eax
+; AVX2-NEXT:    movl 392(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 400(%rbp), %eax
+; AVX2-NEXT:    movl 400(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 408(%rbp), %eax
+; AVX2-NEXT:    movl 408(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 416(%rbp), %eax
+; AVX2-NEXT:    movl 416(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 424(%rbp), %eax
+; AVX2-NEXT:    movl 424(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 432(%rbp), %eax
+; AVX2-NEXT:    movl 432(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 440(%rbp), %eax
+; AVX2-NEXT:    movl 440(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 448(%rbp), %eax
+; AVX2-NEXT:    movl 448(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 456(%rbp), %eax
+; AVX2-NEXT:    movl 456(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 464(%rbp), %eax
+; AVX2-NEXT:    movl 464(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 472(%rbp), %eax
+; AVX2-NEXT:    movl 472(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 224(%rbp), %eax
+; AVX2-NEXT:    movl 224(%rbp), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm5
-; AVX2-NEXT:    movzbl 232(%rbp), %eax
+; AVX2-NEXT:    movl 232(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 240(%rbp), %eax
+; AVX2-NEXT:    movl 240(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 248(%rbp), %eax
+; AVX2-NEXT:    movl 248(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 256(%rbp), %eax
+; AVX2-NEXT:    movl 256(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 264(%rbp), %eax
+; AVX2-NEXT:    movl 264(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 272(%rbp), %eax
+; AVX2-NEXT:    movl 272(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 280(%rbp), %eax
+; AVX2-NEXT:    movl 280(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 288(%rbp), %eax
+; AVX2-NEXT:    movl 288(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 296(%rbp), %eax
+; AVX2-NEXT:    movl 296(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 304(%rbp), %eax
+; AVX2-NEXT:    movl 304(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 312(%rbp), %eax
+; AVX2-NEXT:    movl 312(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 320(%rbp), %eax
+; AVX2-NEXT:    movl 320(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 328(%rbp), %eax
+; AVX2-NEXT:    movl 328(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 336(%rbp), %eax
+; AVX2-NEXT:    movl 336(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 344(%rbp), %eax
+; AVX2-NEXT:    movl 344(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT:    vmovd %edi, %xmm5
-; AVX2-NEXT:    vpinsrb $1, %esi, %xmm5, %xmm5
-; AVX2-NEXT:    vpinsrb $2, %edx, %xmm5, %xmm5
-; AVX2-NEXT:    vpinsrb $3, %r9d, %xmm5, %xmm5
-; AVX2-NEXT:    vpinsrb $4, %r10d, %xmm5, %xmm5
-; AVX2-NEXT:    vpinsrb $5, %r11d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 16(%rbp), %ebx
-; AVX2-NEXT:    vpinsrb $6, %ebx, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 24(%rbp), %r14d
-; AVX2-NEXT:    vpinsrb $7, %r14d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 32(%rbp), %r15d
-; AVX2-NEXT:    vpinsrb $8, %r15d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 40(%rbp), %r12d
-; AVX2-NEXT:    vpinsrb $9, %r12d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 48(%rbp), %r13d
-; AVX2-NEXT:    vpinsrb $10, %r13d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 56(%rbp), %eax
+; AVX2-NEXT:    movl 96(%rbp), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm5
+; AVX2-NEXT:    movl 104(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 112(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 120(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 128(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 136(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 144(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 152(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 160(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 168(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 176(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
+; AVX2-NEXT:    movl 184(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 64(%rbp), %eax
+; AVX2-NEXT:    movl 192(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 72(%rbp), %eax
+; AVX2-NEXT:    movl 200(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 80(%rbp), %eax
+; AVX2-NEXT:    movl 208(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 88(%rbp), %eax
+; AVX2-NEXT:    movl 216(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 96(%rbp), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm6
-; AVX2-NEXT:    movzbl 104(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $1, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 112(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 120(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $3, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 128(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 136(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $5, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 144(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 152(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 160(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 168(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 176(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 184(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 192(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 200(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 208(%rbp), %eax
+; AVX2-NEXT:    vmovd %edi, %xmm6
+; AVX2-NEXT:    vpinsrb $1, %esi, %xmm6, %xmm6
+; AVX2-NEXT:    vpinsrb $2, %edx, %xmm6, %xmm6
+; AVX2-NEXT:    vpinsrb $3, %r13d, %xmm6, %xmm6
+; AVX2-NEXT:    vpinsrb $4, %r8d, %xmm6, %xmm6
+; AVX2-NEXT:    vpinsrb $5, %r9d, %xmm6, %xmm6
+; AVX2-NEXT:    movl 16(%rbp), %esi
+; AVX2-NEXT:    vpinsrb $6, %esi, %xmm6, %xmm6
+; AVX2-NEXT:    movl 24(%rbp), %edi
+; AVX2-NEXT:    vpinsrb $7, %edi, %xmm6, %xmm6
+; AVX2-NEXT:    movl 32(%rbp), %r8d
+; AVX2-NEXT:    vpinsrb $8, %r8d, %xmm6, %xmm6
+; AVX2-NEXT:    movl 40(%rbp), %r9d
+; AVX2-NEXT:    vpinsrb $9, %r9d, %xmm6, %xmm6
+; AVX2-NEXT:    movl 48(%rbp), %r10d
+; AVX2-NEXT:    vpinsrb $10, %r10d, %xmm6, %xmm6
+; AVX2-NEXT:    movl 56(%rbp), %r11d
+; AVX2-NEXT:    vpinsrb $11, %r11d, %xmm6, %xmm6
+; AVX2-NEXT:    movl 64(%rbp), %r14d
+; AVX2-NEXT:    vpinsrb $12, %r14d, %xmm6, %xmm6
+; AVX2-NEXT:    movl 72(%rbp), %r12d
+; AVX2-NEXT:    vpinsrb $13, %r12d, %xmm6, %xmm6
+; AVX2-NEXT:    movl 80(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 216(%rbp), %eax
+; AVX2-NEXT:    movl 88(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
@@ -1962,435 +1980,379 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
 ; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
 ; AVX2-NEXT:    movzbl %al, %eax
 ; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    movzbl (%rsp,%rax), %edx
+; AVX2-NEXT:    movzbl (%rsp,%rax), %eax
+; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp)
-; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    andl $1, %ebx
+; AVX2-NEXT:    addq %rax, %rbx
+; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rbx)
+; AVX2-NEXT:    andl $1, %r15d
+; AVX2-NEXT:    addq %rbx, %r15
+; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%r15)
+; AVX2-NEXT:    andl $1, %r13d
+; AVX2-NEXT:    addq %r15, %r13
+; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%r13)
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %r13, %rcx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
 ; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    addq %rdi, %rsi
-; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rsi)
+; AVX2-NEXT:    addq %rax, %rsi
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    andl $1, %edi
+; AVX2-NEXT:    addq %rsi, %rdi
+; AVX2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
+; AVX2-NEXT:    andl $63, %esi
+; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rsi)
 ; AVX2-NEXT:    andl $1, %r8d
-; AVX2-NEXT:    addq %rsi, %r8
-; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%r8)
+; AVX2-NEXT:    addq %rdi, %r8
+; AVX2-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX2-NEXT:    andl $63, %edi
+; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rdi)
 ; AVX2-NEXT:    andl $1, %r9d
 ; AVX2-NEXT:    addq %r8, %r9
-; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%r9)
+; AVX2-NEXT:    # kill: def $r8d killed $r8d killed $r8 def $r8
+; AVX2-NEXT:    andl $63, %r8d
+; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%r8)
 ; AVX2-NEXT:    andl $1, %r10d
 ; AVX2-NEXT:    addq %r9, %r10
-; AVX2-NEXT:    movl %r10d, %eax
-; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    # kill: def $r9d killed $r9d killed $r9 def $r9
+; AVX2-NEXT:    andl $63, %r9d
+; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%r9)
 ; AVX2-NEXT:    andl $1, %r11d
 ; AVX2-NEXT:    addq %r10, %r11
-; AVX2-NEXT:    movzbl %bl, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %r11, %rax
+; AVX2-NEXT:    # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX2-NEXT:    andl $63, %r10d
+; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%r10)
+; AVX2-NEXT:    andl $1, %r14d
+; AVX2-NEXT:    addq %r11, %r14
 ; AVX2-NEXT:    # kill: def $r11d killed $r11d killed $r11 def $r11
 ; AVX2-NEXT:    andl $63, %r11d
-; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%r11)
-; AVX2-NEXT:    movzbl %r14b, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl %r15b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl %r12b, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl %r13b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 56(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 64(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 72(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 80(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%r11)
+; AVX2-NEXT:    andl $1, %r12d
+; AVX2-NEXT:    addq %r14, %r12
+; AVX2-NEXT:    # kill: def $r14d killed $r14d killed $r14 def $r14
+; AVX2-NEXT:    andl $63, %r14d
+; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%r14)
+; AVX2-NEXT:    movl 80(%rbp), %eax
 ; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 88(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    addq %r12, %rax
+; AVX2-NEXT:    # kill: def $r12d killed $r12d killed $r12 def $r12
+; AVX2-NEXT:    andl $63, %r12d
+; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%r12)
+; AVX2-NEXT:    movl 88(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 96(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    movl 96(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 104(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 104(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 112(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 120(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 112(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 120(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 128(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 136(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 128(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 136(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 144(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 152(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 144(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 152(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 160(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 168(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 160(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 168(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 176(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 184(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 176(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 184(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 192(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 200(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 192(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 200(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 208(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 216(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 208(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 216(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 224(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 232(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 224(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $0, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 232(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $1, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 240(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 248(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 240(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $2, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 248(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $3, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 256(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 264(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 256(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $4, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 264(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $5, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 272(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 280(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 272(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $6, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 280(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $7, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 288(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 296(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 288(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $8, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 296(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $9, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 304(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 312(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 304(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $10, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 312(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $11, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 320(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 328(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 320(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $12, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 328(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $13, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 336(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 344(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 336(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $14, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 344(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 352(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    movl 352(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 360(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 360(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 368(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 376(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 368(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 376(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 384(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 392(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 384(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 392(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 400(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 408(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 400(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 408(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 416(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 424(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 416(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 424(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 432(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 440(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 432(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 440(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 448(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 456(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 448(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 456(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 464(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 472(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 464(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 472(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
 ; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
 ; AVX2-NEXT:    cmpq $64, %rcx
-; AVX2-NEXT:    cmovbl %edx, %eax
+; AVX2-NEXT:    cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
 ; AVX2-NEXT:    cmpq $63, %rcx
-; AVX2-NEXT:    movl $63, %edx
-; AVX2-NEXT:    cmovbq %rcx, %rdx
-; AVX2-NEXT:    movb %al, (%rsp,%rdx)
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    movl $63, %ecx
+; AVX2-NEXT:    cmovbq %rdx, %rcx
+; AVX2-NEXT:    movb %al, (%rsp,%rcx)
 ; AVX2-NEXT:    vmovaps (%rsp), %ymm0
 ; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
 ; AVX2-NEXT:    leaq -40(%rbp), %rsp
@@ -4499,8 +4461,9 @@ define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x
 ; AVX2-NEXT:    cmovbq %r11, %rax
 ; AVX2-NEXT:    movl %eax, %eax
 ; AVX2-NEXT:    movq %rbx, (%rsp,%rax,8)
-; AVX2-NEXT:    vmovaps (%rsp), %ymm0
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
 ; AVX2-NEXT:    leaq -8(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %rbp
@@ -4511,18 +4474,20 @@ define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x
 ; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
 ; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_compress_knownbits_zext_v8i16_8i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
 ; AVX512VL-NEXT:    retq
   %xvec = zext <8 x i16> %vec to <8 x i64> ;  0 -> 65535
   %xpassthru = and <8 x i64> %passthru, splat (i64 3) ; 0 -> 3
@@ -4603,8 +4568,18 @@ define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i
 ; AVX2-NEXT:    cmovbq %r11, %rax
 ; AVX2-NEXT:    movl %eax, %eax
 ; AVX2-NEXT:    movq %rbx, (%rsp,%rax,8)
-; AVX2-NEXT:    vmovaps (%rsp), %ymm0
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT:    vmovdqa (%rsp), %ymm0
+; AVX2-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT:    vpsllq $48, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm2
+; AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpsllq $48, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm2
+; AVX2-NEXT:    vpslld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX2-NEXT:    leaq -8(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %rbp
@@ -4615,18 +4590,22 @@ define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i
 ; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
 ; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm1
-; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vpsllq $48, %zmm1, %zmm0
+; AVX512F-NEXT:    vpsraq $48, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_compress_knownbits_sext_v8i16_8i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
-; AVX512VL-NEXT:    vpmovsxwq %xmm0, %zmm1
-; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vpsllq $48, %zmm1, %zmm0
+; AVX512VL-NEXT:    vpsraq $48, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
   %xvec = sext <8 x i16> %vec to <8 x i64> ; sign extend vec
   %xpassthru = and <8 x i64> %passthru, splat(i64 3)

>From ae48367bdc111c0951107418831707b6bb59c15f Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Thu, 2 Oct 2025 10:50:32 -0700
Subject: [PATCH 3/5] Address latest review feedback

---
 llvm/test/CodeGen/X86/eliminate-redundant-zext.ll | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
index 4399841d49876..294a6e7f780e3 100644
--- a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
+++ b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -17,11 +17,10 @@ define i32 @countholes(ptr %s) {
 ; CHECK-NEXT:  # %bb.1: # %while.body.preheader
 ; CHECK-NEXT:    incq %rdi
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    .p2align 4{{$}}
+; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_2: # %while.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NOT:     movzbl %cl, %ecx
-; CHECK:         addl {{.*}}, %eax
+; CHECK-NEXT:    addl pre_table-192(,%rcx,4), %eax
 ; CHECK-NEXT:    movzbl (%rdi), %ecx
 ; CHECK-NEXT:    incq %rdi
 ; CHECK-NEXT:    cmpb $47, %cl

>From 7d9e8ec6ad93b3fad8d6ed0f3c7b17ac5314b889 Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Thu, 2 Oct 2025 15:31:11 -0700
Subject: [PATCH 4/5] Update existing test expectations affected by the new pass

---
 llvm/test/CodeGen/X86/atomic-rm-bit-test.ll | 10 ----------
 llvm/test/CodeGen/X86/ctlz.ll               |  2 --
 llvm/test/CodeGen/X86/isel-select-cmov.ll   |  4 ----
 llvm/test/CodeGen/X86/isel-udiv.ll          |  1 -
 llvm/test/CodeGen/X86/isel-urem.ll          |  1 -
 llvm/test/CodeGen/X86/popcnt.ll             |  2 --
 llvm/test/CodeGen/X86/sttni.ll              |  5 -----
 7 files changed, 25 deletions(-)

diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index b4d40fee01e41..a283a002d9818 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -177,7 +177,6 @@ define zeroext i8 @atomic_shl1_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounwind {
 ; X86-NEXT:    lock cmpxchgb %cl, (%esi)
 ; X86-NEXT:    jne .LBB3_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    testl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
@@ -198,7 +197,6 @@ define zeroext i8 @atomic_shl1_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounwind {
 ; X64-NEXT:    lock cmpxchgb %cl, (%rdi)
 ; X64-NEXT:    jne .LBB3_1
 ; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    testl %eax, %edx
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
@@ -233,7 +231,6 @@ define zeroext i8 @atomic_shl1_mask0_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounw
 ; X86-NEXT:    lock cmpxchgb %cl, (%esi)
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movzbl %dl, %ecx
 ; X86-NEXT:    btl %ecx, %eax
 ; X86-NEXT:    setae %al
@@ -255,7 +252,6 @@ define zeroext i8 @atomic_shl1_mask0_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounw
 ; X64-NEXT:    lock cmpxchgb %cl, (%rdi)
 ; X64-NEXT:    jne .LBB4_1
 ; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    btl %ecx, %eax
 ; X64-NEXT:    setae %al
@@ -291,7 +287,6 @@ define zeroext i8 @atomic_shl1_mask01_xor_8_gpr_valz(ptr %v, i8 zeroext %c) noun
 ; X86-NEXT:    lock cmpxchgb %cl, (%edx)
 ; X86-NEXT:    jne .LBB5_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    testl %eax, %ebx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %ebx
@@ -313,7 +308,6 @@ define zeroext i8 @atomic_shl1_mask01_xor_8_gpr_valz(ptr %v, i8 zeroext %c) noun
 ; X64-NEXT:    lock cmpxchgb %cl, (%rdi)
 ; X64-NEXT:    jne .LBB5_1
 ; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    testl %eax, %edx
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
@@ -349,7 +343,6 @@ define zeroext i8 @atomic_shl1_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounwind {
 ; X86-NEXT:    lock cmpxchgb %ch, (%edx)
 ; X86-NEXT:    jne .LBB6_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    testl %eax, %ebx
 ; X86-NEXT:    je .LBB6_3
 ; X86-NEXT:  # %bb.4: # %if.then
@@ -378,7 +371,6 @@ define zeroext i8 @atomic_shl1_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounwind {
 ; X64-NEXT:    lock cmpxchgb %r8b, (%rdi)
 ; X64-NEXT:    jne .LBB6_1
 ; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    testl %eax, %edx
 ; X64-NEXT:    je .LBB6_3
 ; X64-NEXT:  # %bb.4: # %if.then
@@ -512,7 +504,6 @@ define zeroext i8 @atomic_shl1_mask01_and_8_gpr_brnz(ptr %v, i8 zeroext %c) noun
 ; X86-NEXT:    testl %ecx, %ebx
 ; X86-NEXT:    je .LBB8_3
 ; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movzbl %ah, %eax
 ; X86-NEXT:    movzbl (%edx,%eax), %eax
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
@@ -538,7 +529,6 @@ define zeroext i8 @atomic_shl1_mask01_and_8_gpr_brnz(ptr %v, i8 zeroext %c) noun
 ; X64-NEXT:    lock cmpxchgb %r8b, (%rdi)
 ; X64-NEXT:    jne .LBB8_1
 ; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    testl %eax, %edx
 ; X64-NEXT:    je .LBB8_3
 ; X64-NEXT:  # %bb.4: # %if.then
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index 1267fe9033454..a3d28a7fcba24 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -224,7 +224,6 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-NOCMOV-NEXT:    testb %al, %al
 ; X86-NOCMOV-NEXT:    je .LBB4_1
 ; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
-; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
 ; X86-NOCMOV-NEXT:    xorl $7, %eax
 ; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
@@ -961,7 +960,6 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X86-NOCMOV-NEXT:    testb %al, %al
 ; X86-NOCMOV-NEXT:    je .LBB16_1
 ; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
-; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
 ; X86-NOCMOV-NEXT:    xorl $7, %eax
 ; X86-NOCMOV-NEXT:    xorb $7, %al
diff --git a/llvm/test/CodeGen/X86/isel-select-cmov.ll b/llvm/test/CodeGen/X86/isel-select-cmov.ll
index d013ad2c7fbff..783db3487e2bd 100644
--- a/llvm/test/CodeGen/X86/isel-select-cmov.ll
+++ b/llvm/test/CodeGen/X86/isel-select-cmov.ll
@@ -73,11 +73,9 @@ define zeroext i8 @select_cmov_i8(i1 zeroext %cond, i8 zeroext %a, i8 zeroext %b
 ; FAST-X86-NEXT:    jne LBB0_1
 ; FAST-X86-NEXT:  ## %bb.2:
 ; FAST-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; FAST-X86-NEXT:    movzbl %al, %eax
 ; FAST-X86-NEXT:    retl
 ; FAST-X86-NEXT:  LBB0_1:
 ; FAST-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; FAST-X86-NEXT:    movzbl %al, %eax
 ; FAST-X86-NEXT:    retl
 ;
 ; FAST-X86-CMOV-LABEL: select_cmov_i8:
@@ -86,11 +84,9 @@ define zeroext i8 @select_cmov_i8(i1 zeroext %cond, i8 zeroext %a, i8 zeroext %b
 ; FAST-X86-CMOV-NEXT:    jne LBB0_1
 ; FAST-X86-CMOV-NEXT:  ## %bb.2:
 ; FAST-X86-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; FAST-X86-CMOV-NEXT:    movzbl %al, %eax
 ; FAST-X86-CMOV-NEXT:    retl
 ; FAST-X86-CMOV-NEXT:  LBB0_1:
 ; FAST-X86-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; FAST-X86-CMOV-NEXT:    movzbl %al, %eax
 ; FAST-X86-CMOV-NEXT:    retl
 ;
 ; GISEL-X86-LABEL: select_cmov_i8:
diff --git a/llvm/test/CodeGen/X86/isel-udiv.ll b/llvm/test/CodeGen/X86/isel-udiv.ll
index b123b3c7780fa..f96a12c2fafd0 100644
--- a/llvm/test/CodeGen/X86/isel-udiv.ll
+++ b/llvm/test/CodeGen/X86/isel-udiv.ll
@@ -22,7 +22,6 @@ define i8 @test_udiv_i8(i8 %arg1, i8 %arg2) nounwind {
 ; GISEL-X86-LABEL: test_udiv_i8:
 ; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; GISEL-X86-NEXT:    movzbl %al, %eax
 ; GISEL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; GISEL-X86-NEXT:    divb %cl
 ; GISEL-X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/isel-urem.ll b/llvm/test/CodeGen/X86/isel-urem.ll
index 386f08151ad9c..5dd901fe8daa6 100644
--- a/llvm/test/CodeGen/X86/isel-urem.ll
+++ b/llvm/test/CodeGen/X86/isel-urem.ll
@@ -49,7 +49,6 @@ define i8 @test_urem_i8(i8 %arg1, i8 %arg2) nounwind {
 ; GISEL-X86-LABEL: test_urem_i8:
 ; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; GISEL-X86-NEXT:    movzbl %al, %eax
 ; GISEL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; GISEL-X86-NEXT:    divb %cl
 ; GISEL-X86-NEXT:    movb %ah, %al
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index 3004b8b72fcc5..cd5edffd8ccda 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -67,7 +67,6 @@ define i16 @cnt16(i16 %x) nounwind readnone {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrl $8, %eax
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
@@ -1840,7 +1839,6 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrl $8, %eax
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-BASE-LABEL: popcount_i16_zext:
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 39cbee54737c3..b0c92831124bf 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -89,7 +89,6 @@ define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs,
 ; X64-NEXT:    jne .LBB2_2
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB2_2: # %compare
 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -222,7 +221,6 @@ define i32 @pcmpestri_mem_diff_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
 ; X64-NEXT:    jne .LBB5_2
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB5_2: # %compare
 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
@@ -552,7 +550,6 @@ define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
 ; X86-NEXT:    jne .LBB14_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB14_2: # %compare
 ; X86-NEXT:    pushl %ebp
@@ -577,7 +574,6 @@ define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
 ; X64-NEXT:    jne .LBB14_2
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB14_2: # %compare
 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -690,7 +686,6 @@ define i32 @pcmpistri_mem_diff_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X64-NEXT:    jne .LBB17_2
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB17_2: # %compare
 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)

>From 0d9c3131428f6b7c5dfdc185d97ed60dd3ddcadd Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Thu, 2 Oct 2025 20:22:17 -0700
Subject: [PATCH 5/5] Fix the two remaining failing tests (pr38539.ll, vector-compress.ll)

---
 llvm/test/CodeGen/X86/pr38539.ll         |    3 +-
 llvm/test/CodeGen/X86/vector-compress.ll | 1027 +++++++++++-----------
 2 files changed, 524 insertions(+), 506 deletions(-)

diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index b633c28a214b7..147abcdbff0b9 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -23,12 +23,11 @@ define void @f() nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    movzbl (%eax), %ecx
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    divb %cl
 ; X86-NEXT:    movl %edi, %eax
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index a9b637931fc9b..53e6e49268789 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -1094,26 +1094,25 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vpextrb $1, %xmm1, %r11d
-; AVX2-NEXT:    vmovd %xmm1, %eax
-; AVX2-NEXT:    movzbl %al, %edx
-; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    vpextrb $1, %xmm1, %r13d
+; AVX2-NEXT:    vmovd %xmm1, %esi
+; AVX2-NEXT:    movl %esi, %eax
 ; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    subb %r11b, %al
-; AVX2-NEXT:    vpextrb $2, %xmm1, %esi
-; AVX2-NEXT:    subb %sil, %al
-; AVX2-NEXT:    vpextrb $3, %xmm1, %r13d
 ; AVX2-NEXT:    subb %r13b, %al
+; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
+; AVX2-NEXT:    subb %dl, %al
+; AVX2-NEXT:    vpextrb $3, %xmm1, %ebp
+; AVX2-NEXT:    subb %bpl, %al
 ; AVX2-NEXT:    vpextrb $4, %xmm1, %r12d
 ; AVX2-NEXT:    subb %r12b, %al
 ; AVX2-NEXT:    vpextrb $5, %xmm1, %r15d
 ; AVX2-NEXT:    subb %r15b, %al
 ; AVX2-NEXT:    vpextrb $6, %xmm1, %r14d
 ; AVX2-NEXT:    subb %r14b, %al
-; AVX2-NEXT:    vpextrb $7, %xmm1, %ebp
-; AVX2-NEXT:    subb %bpl, %al
-; AVX2-NEXT:    vpextrb $8, %xmm1, %ebx
+; AVX2-NEXT:    vpextrb $7, %xmm1, %ebx
 ; AVX2-NEXT:    subb %bl, %al
+; AVX2-NEXT:    vpextrb $8, %xmm1, %r11d
+; AVX2-NEXT:    subb %r11b, %al
 ; AVX2-NEXT:    vpextrb $9, %xmm1, %r10d
 ; AVX2-NEXT:    subb %r10b, %al
 ; AVX2-NEXT:    vpextrb $10, %xmm1, %r9d
@@ -1123,108 +1122,94 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
 ; AVX2-NEXT:    vpextrb $12, %xmm1, %edi
 ; AVX2-NEXT:    subb %dil, %al
 ; AVX2-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    subb %cl, %al
 ; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    subb %cl, %al
 ; AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
-; AVX2-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX2-NEXT:    subb %cl, %al
 ; AVX2-NEXT:    movzbl %al, %eax
 ; AVX2-NEXT:    andl $15, %eax
 ; AVX2-NEXT:    movzbl -40(%rsp,%rax), %eax
 ; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX2-NEXT:    vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    vpextrb $1, %xmm0, -40(%rsp,%rsi)
+; AVX2-NEXT:    andl $1, %r13d
+; AVX2-NEXT:    addq %rsi, %r13
+; AVX2-NEXT:    vpextrb $2, %xmm0, -40(%rsp,%r13)
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    vpextrb $1, %xmm0, -40(%rsp,%rdx)
-; AVX2-NEXT:    movzbl %r11b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rdx, %rax
-; AVX2-NEXT:    vpextrb $2, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT:    movzbl %sil, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    vpextrb $3, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT:    movzbl %r13b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    vpextrb $4, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT:    movzbl %r12b, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    movzbl %r15b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $15, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT:    movzbl %r14b, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT:    movzbl %bpl, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $15, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT:    movzbl %bl, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpextrb $8, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT:    movzbl %r10b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $15, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT:    movzbl %r9b, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpextrb $10, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT:    movzbl %r8b, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $15, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT:    movzbl %dil, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpextrb $12, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX2-NEXT:    addq %r13, %rdx
+; AVX2-NEXT:    vpextrb $3, %xmm0, -40(%rsp,%rdx)
+; AVX2-NEXT:    andl $1, %ebp
+; AVX2-NEXT:    addq %rdx, %rbp
+; AVX2-NEXT:    vpextrb $4, %xmm0, -40(%rsp,%rbp)
+; AVX2-NEXT:    andl $1, %r12d
+; AVX2-NEXT:    addq %rbp, %r12
+; AVX2-NEXT:    andl $1, %r15d
+; AVX2-NEXT:    addq %r12, %r15
+; AVX2-NEXT:    # kill: def $r12d killed $r12d killed $r12 def $r12
+; AVX2-NEXT:    andl $15, %r12d
+; AVX2-NEXT:    vpextrb $5, %xmm0, -40(%rsp,%r12)
+; AVX2-NEXT:    andl $1, %r14d
+; AVX2-NEXT:    addq %r15, %r14
+; AVX2-NEXT:    # kill: def $r15d killed $r15d killed $r15 def $r15
+; AVX2-NEXT:    andl $15, %r15d
+; AVX2-NEXT:    vpextrb $6, %xmm0, -40(%rsp,%r15)
+; AVX2-NEXT:    andl $1, %ebx
+; AVX2-NEXT:    addq %r14, %rbx
+; AVX2-NEXT:    # kill: def $r14d killed $r14d killed $r14 def $r14
+; AVX2-NEXT:    andl $15, %r14d
+; AVX2-NEXT:    vpextrb $7, %xmm0, -40(%rsp,%r14)
+; AVX2-NEXT:    andl $1, %r11d
+; AVX2-NEXT:    addq %rbx, %r11
+; AVX2-NEXT:    # kill: def $ebx killed $ebx killed $rbx def $rbx
+; AVX2-NEXT:    andl $15, %ebx
+; AVX2-NEXT:    vpextrb $8, %xmm0, -40(%rsp,%rbx)
+; AVX2-NEXT:    andl $1, %r10d
+; AVX2-NEXT:    addq %r11, %r10
+; AVX2-NEXT:    # kill: def $r11d killed $r11d killed $r11 def $r11
+; AVX2-NEXT:    andl $15, %r11d
+; AVX2-NEXT:    vpextrb $9, %xmm0, -40(%rsp,%r11)
+; AVX2-NEXT:    andl $1, %r9d
+; AVX2-NEXT:    addq %r10, %r9
+; AVX2-NEXT:    # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX2-NEXT:    andl $15, %r10d
+; AVX2-NEXT:    vpextrb $10, %xmm0, -40(%rsp,%r10)
+; AVX2-NEXT:    andl $1, %r8d
+; AVX2-NEXT:    addq %r9, %r8
+; AVX2-NEXT:    # kill: def $r9d killed $r9d killed $r9 def $r9
+; AVX2-NEXT:    andl $15, %r9d
+; AVX2-NEXT:    vpextrb $11, %xmm0, -40(%rsp,%r9)
+; AVX2-NEXT:    andl $1, %edi
+; AVX2-NEXT:    addq %r8, %rdi
+; AVX2-NEXT:    # kill: def $r8d killed $r8d killed $r8 def $r8
+; AVX2-NEXT:    andl $15, %r8d
+; AVX2-NEXT:    vpextrb $12, %xmm0, -40(%rsp,%r8)
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    addq %rdi, %rsi
+; AVX2-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX2-NEXT:    andl $15, %edi
+; AVX2-NEXT:    vpextrb $13, %xmm0, -40(%rsp,%rdi)
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $15, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; AVX2-NEXT:    addq %rsi, %rax
+; AVX2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
+; AVX2-NEXT:    andl $15, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm0, -40(%rsp,%rsi)
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpextrb $14, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $15, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT:    cmpq $15, %rax
-; AVX2-NEXT:    movl $15, %ecx
-; AVX2-NEXT:    cmovbq %rax, %rcx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX2-NEXT:    cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
-; AVX2-NEXT:    movb %al, -40(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $15, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT:    cmpq $15, %rcx
+; AVX2-NEXT:    movl $15, %eax
+; AVX2-NEXT:    cmovbq %rcx, %rax
+; AVX2-NEXT:    vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT:    cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
+; AVX2-NEXT:    movb %cl, -40(%rsp,%rax)
 ; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r12
@@ -1805,140 +1790,137 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
 ; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
 ; AVX2-NEXT:    andq $-32, %rsp
-; AVX2-NEXT:    subq $128, %rsp
-; AVX2-NEXT:    # kill: def $r9d killed $r9d def $r9
-; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    # kill: def $r8d killed $r8d def $r8
-; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl %ecx, %r13d
-; AVX2-NEXT:    movl %edx, %r15d
-; AVX2-NEXT:    movl %esi, %ebx
+; AVX2-NEXT:    subq $96, %rsp
+; AVX2-NEXT:    movl %r9d, %r11d
+; AVX2-NEXT:    movl %r8d, %r10d
+; AVX2-NEXT:    movl %ecx, %r9d
+; AVX2-NEXT:    movl %edx, %r8d
+; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movl 360(%rbp), %eax
-; AVX2-NEXT:    movl 352(%rbp), %ecx
+; AVX2-NEXT:    movzbl 360(%rbp), %eax
+; AVX2-NEXT:    movzbl 352(%rbp), %ecx
 ; AVX2-NEXT:    vmovd %ecx, %xmm4
 ; AVX2-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 368(%rbp), %eax
+; AVX2-NEXT:    movzbl 368(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 376(%rbp), %eax
+; AVX2-NEXT:    movzbl 376(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 384(%rbp), %eax
+; AVX2-NEXT:    movzbl 384(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 392(%rbp), %eax
+; AVX2-NEXT:    movzbl 392(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 400(%rbp), %eax
+; AVX2-NEXT:    movzbl 400(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 408(%rbp), %eax
+; AVX2-NEXT:    movzbl 408(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 416(%rbp), %eax
+; AVX2-NEXT:    movzbl 416(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 424(%rbp), %eax
+; AVX2-NEXT:    movzbl 424(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 432(%rbp), %eax
+; AVX2-NEXT:    movzbl 432(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 440(%rbp), %eax
+; AVX2-NEXT:    movzbl 440(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 448(%rbp), %eax
+; AVX2-NEXT:    movzbl 448(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 456(%rbp), %eax
+; AVX2-NEXT:    movzbl 456(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 464(%rbp), %eax
+; AVX2-NEXT:    movzbl 464(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 472(%rbp), %eax
+; AVX2-NEXT:    movzbl 472(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl 224(%rbp), %eax
+; AVX2-NEXT:    movzbl 224(%rbp), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm5
-; AVX2-NEXT:    movl 232(%rbp), %eax
+; AVX2-NEXT:    movzbl 232(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 240(%rbp), %eax
+; AVX2-NEXT:    movzbl 240(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 248(%rbp), %eax
+; AVX2-NEXT:    movzbl 248(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 256(%rbp), %eax
+; AVX2-NEXT:    movzbl 256(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 264(%rbp), %eax
+; AVX2-NEXT:    movzbl 264(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 272(%rbp), %eax
+; AVX2-NEXT:    movzbl 272(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 280(%rbp), %eax
+; AVX2-NEXT:    movzbl 280(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 288(%rbp), %eax
+; AVX2-NEXT:    movzbl 288(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 296(%rbp), %eax
+; AVX2-NEXT:    movzbl 296(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 304(%rbp), %eax
+; AVX2-NEXT:    movzbl 304(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 312(%rbp), %eax
+; AVX2-NEXT:    movzbl 312(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 320(%rbp), %eax
+; AVX2-NEXT:    movzbl 320(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 328(%rbp), %eax
+; AVX2-NEXT:    movzbl 328(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 336(%rbp), %eax
+; AVX2-NEXT:    movzbl 336(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 344(%rbp), %eax
+; AVX2-NEXT:    movzbl 344(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT:    movl 96(%rbp), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm5
-; AVX2-NEXT:    movl 104(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 112(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 120(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 128(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 136(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 144(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 152(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 160(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 168(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 176(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 184(%rbp), %eax
+; AVX2-NEXT:    vmovd %edi, %xmm5
+; AVX2-NEXT:    vpinsrb $1, %esi, %xmm5, %xmm5
+; AVX2-NEXT:    vpinsrb $2, %edx, %xmm5, %xmm5
+; AVX2-NEXT:    vpinsrb $3, %r9d, %xmm5, %xmm5
+; AVX2-NEXT:    vpinsrb $4, %r10d, %xmm5, %xmm5
+; AVX2-NEXT:    vpinsrb $5, %r11d, %xmm5, %xmm5
+; AVX2-NEXT:    movzbl 16(%rbp), %ebx
+; AVX2-NEXT:    vpinsrb $6, %ebx, %xmm5, %xmm5
+; AVX2-NEXT:    movzbl 24(%rbp), %r14d
+; AVX2-NEXT:    vpinsrb $7, %r14d, %xmm5, %xmm5
+; AVX2-NEXT:    movzbl 32(%rbp), %r15d
+; AVX2-NEXT:    vpinsrb $8, %r15d, %xmm5, %xmm5
+; AVX2-NEXT:    movzbl 40(%rbp), %r12d
+; AVX2-NEXT:    vpinsrb $9, %r12d, %xmm5, %xmm5
+; AVX2-NEXT:    movzbl 48(%rbp), %r13d
+; AVX2-NEXT:    vpinsrb $10, %r13d, %xmm5, %xmm5
+; AVX2-NEXT:    movzbl 56(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 192(%rbp), %eax
+; AVX2-NEXT:    movzbl 64(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 200(%rbp), %eax
+; AVX2-NEXT:    movzbl 72(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 208(%rbp), %eax
+; AVX2-NEXT:    movzbl 80(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl 216(%rbp), %eax
+; AVX2-NEXT:    movzbl 88(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vmovd %edi, %xmm6
-; AVX2-NEXT:    vpinsrb $1, %esi, %xmm6, %xmm6
-; AVX2-NEXT:    vpinsrb $2, %edx, %xmm6, %xmm6
-; AVX2-NEXT:    vpinsrb $3, %r13d, %xmm6, %xmm6
-; AVX2-NEXT:    vpinsrb $4, %r8d, %xmm6, %xmm6
-; AVX2-NEXT:    vpinsrb $5, %r9d, %xmm6, %xmm6
-; AVX2-NEXT:    movl 16(%rbp), %esi
-; AVX2-NEXT:    vpinsrb $6, %esi, %xmm6, %xmm6
-; AVX2-NEXT:    movl 24(%rbp), %edi
-; AVX2-NEXT:    vpinsrb $7, %edi, %xmm6, %xmm6
-; AVX2-NEXT:    movl 32(%rbp), %r8d
-; AVX2-NEXT:    vpinsrb $8, %r8d, %xmm6, %xmm6
-; AVX2-NEXT:    movl 40(%rbp), %r9d
-; AVX2-NEXT:    vpinsrb $9, %r9d, %xmm6, %xmm6
-; AVX2-NEXT:    movl 48(%rbp), %r10d
-; AVX2-NEXT:    vpinsrb $10, %r10d, %xmm6, %xmm6
-; AVX2-NEXT:    movl 56(%rbp), %r11d
-; AVX2-NEXT:    vpinsrb $11, %r11d, %xmm6, %xmm6
-; AVX2-NEXT:    movl 64(%rbp), %r14d
-; AVX2-NEXT:    vpinsrb $12, %r14d, %xmm6, %xmm6
-; AVX2-NEXT:    movl 72(%rbp), %r12d
-; AVX2-NEXT:    vpinsrb $13, %r12d, %xmm6, %xmm6
-; AVX2-NEXT:    movl 80(%rbp), %eax
+; AVX2-NEXT:    movzbl 96(%rbp), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm6
+; AVX2-NEXT:    movzbl 104(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $1, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 112(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $2, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 120(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $3, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 128(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $4, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 136(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $5, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 144(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $6, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 152(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $7, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 160(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $8, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 168(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $9, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 176(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $10, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 184(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $11, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 192(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $12, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 200(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $13, %eax, %xmm6, %xmm6
+; AVX2-NEXT:    movzbl 208(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movl 88(%rbp), %eax
+; AVX2-NEXT:    movzbl 216(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm5
+; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
 ; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
@@ -1980,379 +1962,434 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
 ; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
 ; AVX2-NEXT:    movzbl %al, %eax
 ; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    movzbl (%rsp,%rax), %eax
-; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT:    movzbl (%rsp,%rax), %edx
 ; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp)
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    andl $1, %ebx
-; AVX2-NEXT:    addq %rax, %rbx
-; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rbx)
-; AVX2-NEXT:    andl $1, %r15d
-; AVX2-NEXT:    addq %rbx, %r15
-; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%r15)
-; AVX2-NEXT:    andl $1, %r13d
-; AVX2-NEXT:    addq %r15, %r13
-; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%r13)
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %r13, %rcx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    addq %rax, %rsi
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
 ; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    addq %rsi, %rdi
-; AVX2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
-; AVX2-NEXT:    andl $63, %esi
-; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rsi)
+; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    addq %rdi, %rsi
+; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rsi)
 ; AVX2-NEXT:    andl $1, %r8d
-; AVX2-NEXT:    addq %rdi, %r8
-; AVX2-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
-; AVX2-NEXT:    andl $63, %edi
-; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT:    addq %rsi, %r8
+; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%r8)
 ; AVX2-NEXT:    andl $1, %r9d
 ; AVX2-NEXT:    addq %r8, %r9
-; AVX2-NEXT:    # kill: def $r8d killed $r8d killed $r8 def $r8
-; AVX2-NEXT:    andl $63, %r8d
-; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%r8)
+; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%r9)
 ; AVX2-NEXT:    andl $1, %r10d
 ; AVX2-NEXT:    addq %r9, %r10
-; AVX2-NEXT:    # kill: def $r9d killed $r9d killed $r9 def $r9
-; AVX2-NEXT:    andl $63, %r9d
-; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%r9)
+; AVX2-NEXT:    movl %r10d, %eax
+; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
 ; AVX2-NEXT:    andl $1, %r11d
 ; AVX2-NEXT:    addq %r10, %r11
-; AVX2-NEXT:    # kill: def $r10d killed $r10d killed $r10 def $r10
-; AVX2-NEXT:    andl $63, %r10d
-; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%r10)
-; AVX2-NEXT:    andl $1, %r14d
-; AVX2-NEXT:    addq %r11, %r14
+; AVX2-NEXT:    movzbl %bl, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %r11, %rax
 ; AVX2-NEXT:    # kill: def $r11d killed $r11d killed $r11 def $r11
 ; AVX2-NEXT:    andl $63, %r11d
-; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%r11)
-; AVX2-NEXT:    andl $1, %r12d
-; AVX2-NEXT:    addq %r14, %r12
-; AVX2-NEXT:    # kill: def $r14d killed $r14d killed $r14 def $r14
-; AVX2-NEXT:    andl $63, %r14d
-; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%r14)
-; AVX2-NEXT:    movl 80(%rbp), %eax
+; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%r11)
+; AVX2-NEXT:    movzbl %r14b, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movzbl %r15b, %eax
 ; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %r12, %rax
-; AVX2-NEXT:    # kill: def $r12d killed $r12d killed $r12 def $r12
-; AVX2-NEXT:    andl $63, %r12d
-; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%r12)
-; AVX2-NEXT:    movl 88(%rbp), %ecx
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl %r12b, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 96(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movzbl %r13b, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 56(%rbp), %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movzbl 64(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 72(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movzbl 80(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 88(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movzbl 96(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 104(%rbp), %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 104(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 112(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 120(%rbp), %ecx
+; AVX2-NEXT:    movzbl 112(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 120(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 128(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 136(%rbp), %ecx
+; AVX2-NEXT:    movzbl 128(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 136(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 144(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 152(%rbp), %ecx
+; AVX2-NEXT:    movzbl 144(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 152(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 160(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 168(%rbp), %ecx
+; AVX2-NEXT:    movzbl 160(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 168(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 176(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 184(%rbp), %ecx
+; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movzbl 176(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 184(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 192(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 200(%rbp), %ecx
+; AVX2-NEXT:    movzbl 192(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 200(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 208(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 216(%rbp), %ecx
+; AVX2-NEXT:    movzbl 208(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 216(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 224(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 232(%rbp), %ecx
+; AVX2-NEXT:    movzbl 224(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 232(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $1, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 240(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $2, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 248(%rbp), %ecx
+; AVX2-NEXT:    movzbl 240(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 248(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $3, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 256(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $4, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 264(%rbp), %ecx
+; AVX2-NEXT:    movzbl 256(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 264(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $5, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 272(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 280(%rbp), %ecx
+; AVX2-NEXT:    movzbl 272(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $6, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 280(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $7, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 288(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $8, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 296(%rbp), %ecx
+; AVX2-NEXT:    movzbl 288(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $8, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 296(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $9, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 304(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $10, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 312(%rbp), %ecx
+; AVX2-NEXT:    movzbl 304(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $10, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 312(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $11, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 320(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $12, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 328(%rbp), %ecx
+; AVX2-NEXT:    movzbl 320(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $12, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 328(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $13, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 336(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $14, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 344(%rbp), %ecx
+; AVX2-NEXT:    movzbl 336(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $14, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 344(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movl 352(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    movzbl 352(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 360(%rbp), %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 360(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 368(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 376(%rbp), %ecx
+; AVX2-NEXT:    movzbl 368(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 376(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 384(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 392(%rbp), %ecx
+; AVX2-NEXT:    movzbl 384(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 392(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 400(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 408(%rbp), %ecx
+; AVX2-NEXT:    movzbl 400(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 408(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 416(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 424(%rbp), %ecx
+; AVX2-NEXT:    movzbl 416(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 424(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 432(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 440(%rbp), %ecx
+; AVX2-NEXT:    movzbl 432(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 440(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 448(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 456(%rbp), %ecx
+; AVX2-NEXT:    movzbl 448(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 456(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 464(%rbp), %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %rcx, %rdx
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movl 472(%rbp), %ecx
+; AVX2-NEXT:    movzbl 464(%rbp), %eax
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT:    movzbl 472(%rbp), %ecx
+; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
 ; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
 ; AVX2-NEXT:    cmpq $64, %rcx
-; AVX2-NEXT:    cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; AVX2-NEXT:    cmovbl %edx, %eax
 ; AVX2-NEXT:    cmpq $63, %rcx
-; AVX2-NEXT:    movq %rcx, %rdx
-; AVX2-NEXT:    movl $63, %ecx
-; AVX2-NEXT:    cmovbq %rdx, %rcx
-; AVX2-NEXT:    movb %al, (%rsp,%rcx)
+; AVX2-NEXT:    movl $63, %edx
+; AVX2-NEXT:    cmovbq %rcx, %rdx
+; AVX2-NEXT:    movb %al, (%rsp,%rdx)
 ; AVX2-NEXT:    vmovaps (%rsp), %ymm0
 ; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
 ; AVX2-NEXT:    leaq -40(%rbp), %rsp
@@ -3310,7 +3347,6 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
 ; AVX2-NEXT:    addl %r8d, %r9d
 ; AVX2-NEXT:    movzbl 16(%rbp), %ecx
 ; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%r9,4)
-; AVX2-NEXT:    movzbl %cl, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %r9d, %ecx
 ; AVX2-NEXT:    movzbl 24(%rbp), %edx
@@ -4461,9 +4497,8 @@ define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x
 ; AVX2-NEXT:    cmovbq %r11, %rax
 ; AVX2-NEXT:    movl %eax, %eax
 ; AVX2-NEXT:    movq %rbx, (%rsp,%rax,8)
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
+; AVX2-NEXT:    vmovaps (%rsp), %ymm0
+; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
 ; AVX2-NEXT:    leaq -8(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %rbp
@@ -4474,20 +4509,18 @@ define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x
 ; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
 ; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
-; AVX512F-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_compress_knownbits_zext_v8i16_8i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
-; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %xvec = zext <8 x i16> %vec to <8 x i64> ;  0 -> 65535
   %xpassthru = and <8 x i64> %passthru, splat (i64 3) ; 0 -> 3
@@ -4568,18 +4601,8 @@ define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i
 ; AVX2-NEXT:    cmovbq %r11, %rax
 ; AVX2-NEXT:    movl %eax, %eax
 ; AVX2-NEXT:    movq %rbx, (%rsp,%rax,8)
-; AVX2-NEXT:    vmovdqa (%rsp), %ymm0
-; AVX2-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT:    vpsllq $48, %ymm0, %ymm2
-; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm2
-; AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX2-NEXT:    vpsllq $48, %ymm1, %ymm2
-; AVX2-NEXT:    vpsrad $31, %ymm2, %ymm2
-; AVX2-NEXT:    vpslld $16, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrad $16, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vmovaps (%rsp), %ymm0
+; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
 ; AVX2-NEXT:    leaq -8(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %rbp
@@ -4590,22 +4613,18 @@ define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i
 ; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
 ; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
-; AVX512F-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vpsllq $48, %zmm1, %zmm0
-; AVX512F-NEXT:    vpsraq $48, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm1
+; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_compress_knownbits_sext_v8i16_8i64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
-; AVX512VL-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
-; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT:    vpsllq $48, %zmm1, %zmm0
-; AVX512VL-NEXT:    vpsraq $48, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovsxwq %xmm0, %zmm1
+; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %xvec = sext <8 x i16> %vec to <8 x i64> ; sign extend vec
   %xpassthru = and <8 x i64> %passthru, splat(i64 3)


