[llvm] [X86] Eliminate redundant zero-extension instructions (PR #161401)
Francisco Geiman Thiesen via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 2 20:22:30 PDT 2025
https://github.com/FranciscoThiesen updated https://github.com/llvm/llvm-project/pull/161401
From 622f767197a08e7a2fd307b7a4c9652540d25432 Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Tue, 30 Sep 2025 09:52:34 -0700
Subject: [PATCH 1/5] [X86] Eliminate redundant zero-extension instructions
This pass eliminates redundant MOVZX32rr8 instructions when the source
register is a sub-register of the destination and the destination's upper
bits are already known to be zero.
For example, in loops processing byte values:
```
movzbl (%rdi), %ecx ; ECX upper 24 bits are zero
...
movzbl %cl, %ecx ; Redundant! CL is part of ECX, upper bits already 0
```
The optimization:
- Runs post-register allocation in the X86 backend pipeline
- Analyzes backward through basic blocks to verify upper bits are zero
- Handles cross-block analysis by checking predecessor definitions
- Only eliminates when provably safe (not heuristic)
This pattern commonly occurs in loops that process byte values; eliminating it
saves one instruction per loop iteration and reduces code size by 3 bytes per
removed movzbl.
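For context, here is a minimal C-style sketch of the kind of byte-processing loop
that exhibits the pattern. It is hand-reconstructed from the countholes IR test
added below; the function name, table contents, and the bare `>= '0'` check mirror
that test and are illustrative only.
```
// Hypothetical source for the countholes test case in this patch.
// Each iteration reloads the next byte with a zero-extending load
// (movzbl (%rdi), %ecx), so a later movzbl %cl, %ecx that merely
// re-zero-extends the same value adds nothing.
static const int pre_table[10] = {1, 0, 0, 0, 1, 0, 1, 0, 2, 1};

int countholes(const char *s) {
  unsigned char c = (unsigned char)*s;
  int tot = 0;
  while (c >= '0') {            // assumes the input is digits up to a sentinel
    tot += pre_table[c - '0'];  // index computed from the already zero-extended byte
    c = (unsigned char)*++s;    // zero-extending reload of the next byte
  }
  return tot;
}
```
The FileCheck test below captures the expected output once the pass removes the
redundant zero-extend.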
---
llvm/lib/Target/X86/CMakeLists.txt | 1 +
llvm/lib/Target/X86/X86.h | 4 +
.../X86/X86EliminateRedundantZeroExtend.cpp | 292 ++++++++++++++++++
llvm/lib/Target/X86/X86TargetMachine.cpp | 1 +
.../CodeGen/X86/eliminate-redundant-zext.ll | 63 ++++
5 files changed, 361 insertions(+)
create mode 100644 llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
create mode 100644 llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index f9bd233cf8ecf..351ba623e2b6d 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -47,6 +47,7 @@ set(sources
X86FixupVectorConstants.cpp
X86AvoidStoreForwardingBlocks.cpp
X86DynAllocaExpander.cpp
+ X86EliminateRedundantZeroExtend.cpp
X86FixupSetCC.cpp
X86FlagsCopyLowering.cpp
X86FloatingPoint.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 6261fadf10a7a..cd59eb5c80149 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -127,6 +127,10 @@ FunctionPass *createX86CmovConverterPass();
/// the upper portions of registers, and to save code size.
FunctionPass *createX86FixupBWInsts();
+/// Return a Machine IR pass that eliminates redundant zero-extension
+/// instructions where the upper bits are already known to be zero.
+FunctionPass *createX86EliminateRedundantZeroExtend();
+
/// Return a Machine IR pass that reassigns instruction chains from one domain
/// to another, when profitable.
FunctionPass *createX86DomainReassignmentPass();
diff --git a/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
new file mode 100644
index 0000000000000..72717b1c64794
--- /dev/null
+++ b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
@@ -0,0 +1,292 @@
+//===-- X86EliminateRedundantZeroExtend.cpp - Eliminate Redundant ZExt ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass eliminates redundant zero-extension instructions where the source
+/// register is a sub-register of the destination and the destination's upper
+/// bits are known to be zero.
+///
+/// For example:
+/// movzbl (%rdi), %ecx ; ECX = zero-extend byte, upper 24 bits are zero
+/// ...
+/// movzbl %cl, %ecx ; Redundant! CL is part of ECX, upper bits already 0
+///
+/// This pattern commonly occurs in loops processing byte values.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-eliminate-zext"
+#define PASS_NAME "X86 Eliminate Redundant Zero Extension"
+
+namespace {
+class EliminateRedundantZeroExtend : public MachineFunctionPass {
+public:
+ static char ID;
+ EliminateRedundantZeroExtend() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return PASS_NAME; }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().setNoVRegs();
+ }
+
+private:
+ const X86InstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+
+ /// Check if the register's upper bits are known to be zero at this point.
+ /// This checks backward from MI to find the most recent definition of Reg.
+ bool hasZeroUpperBits(Register Reg, const MachineInstr &MI,
+ const MachineBasicBlock &MBB) const;
+
+ /// Try to eliminate a redundant MOVZX instruction.
+ bool tryEliminateRedundantZeroExtend(MachineInstr &MI,
+ MachineBasicBlock &MBB) const;
+};
+
+char EliminateRedundantZeroExtend::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86EliminateRedundantZeroExtend() {
+ return new EliminateRedundantZeroExtend();
+}
+
+bool EliminateRedundantZeroExtend::hasZeroUpperBits(
+ Register Reg, const MachineInstr &MI, const MachineBasicBlock &MBB) const {
+ // Walk backward from MI to find the most recent definition of Reg
+ MachineBasicBlock::const_reverse_iterator I = ++MI.getReverseIterator();
+ MachineBasicBlock::const_reverse_iterator E = MBB.rend();
+ for (; I != E; ++I) {
+ const MachineInstr &Inst = *I;
+
+ // Check if this instruction defines Reg
+ for (const MachineOperand &MO : Inst.operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+
+ Register DefReg = MO.getReg();
+ if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg)) {
+ // Found a definition - check if it zeros upper bits
+ unsigned Opc = Inst.getOpcode();
+ switch (Opc) {
+ // These instructions zero-extend to 32 bits
+ case X86::MOVZX32rm8:
+ case X86::MOVZX32rr8:
+ case X86::MOVZX32rm16:
+ case X86::MOVZX32rr16:
+ return true;
+ // XOR with self zeros the register
+ case X86::XOR32rr:
+ if (Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg())
+ return true;
+ return false;
+ // MOV32r0 explicitly zeros
+ case X86::MOV32r0:
+ return true;
+ // ADD, SUB on 32-bit register (implicitly zero-extends to 64-bit)
+ case X86::ADD32rr:
+ case X86::ADD32ri:
+ case X86::ADD32rm:
+ case X86::SUB32rr:
+ case X86::SUB32ri:
+ case X86::SUB32rm:
+ case X86::LEA32r:
+ return true;
+ default:
+ // Any other definition might set upper bits, so not safe
+ return false;
+ }
+ }
+
+ // Check if this instruction modifies Reg (partial write or implicit use)
+ if (TRI->regsOverlap(DefReg, Reg)) {
+ // Partial register update - upper bits are unknown
+ return false;
+ }
+ }
+
+ // Check for implicit defs
+ for (const MachineOperand &MO : Inst.implicit_operands()) {
+ if (MO.isReg() && MO.isDef() && TRI->regsOverlap(MO.getReg(), Reg)) {
+ return false;
+ }
+ }
+ }
+
+ // Didn't find a definition in this block - check predecessors
+ // If all predecessors define Reg with zero upper bits, it's safe
+ if (MBB.pred_empty())
+ return false;
+
+ // Check all predecessor blocks
+ for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+ bool FoundZeroExtend = false;
+
+ // SAFETY CHECK: If the sub-register is live-in to the predecessor,
+ // we make the CONSERVATIVE assumption that the parent register was
+ // zero-extended in an earlier block.
+ //
+ // This is safe because:
+ // 1. After register allocation, if $cl is live-in but $ecx is not,
+ // it means only the low 8 bits are meaningful
+ // 2. The register allocator ensures no other code modifies $ecx between
+ // the zero-extension and this point (otherwise $ecx would be live)
+ // 3. Any write to $ch or upper bits would show as a def of $ecx, which
+ // would be found in our backward scan below and handled correctly
+ //
+ // However, this is still conservative - we should verify the actual
+ // definition to be completely safe.
+ Register SubReg8 = TRI->getSubReg(Reg, X86::sub_8bit);
+ Register SubReg16 = TRI->getSubReg(Reg, X86::sub_16bit);
+ bool SubRegLiveIn = (SubReg8 && Pred->isLiveIn(SubReg8)) ||
+ (SubReg16 && Pred->isLiveIn(SubReg16));
+
+ if (SubRegLiveIn) {
+ // Sub-register is live-in. We'll verify this is safe by checking
+ // that no instructions in this block modify the parent register
+ // before we reach the end (where control flows to our block).
+ // If we find any such modification, we'll conservatively bail out.
+ bool SafeToAssume = true;
+ for (const MachineInstr &Inst : *Pred) {
+ for (const MachineOperand &MO : Inst.operands()) {
+ if (MO.isReg() && MO.isDef()) {
+ Register DefReg = MO.getReg();
+ // Check if this modifies Reg or overlaps with it (partial write)
+ if ((DefReg == Reg || TRI->regsOverlap(DefReg, Reg)) &&
+ DefReg != SubReg8 && DefReg != SubReg16) {
+ // Found a write to the parent register or overlapping register
+ // that's not just the sub-register we expect
+ SafeToAssume = false;
+ break;
+ }
+ }
+ }
+ if (!SafeToAssume)
+ break;
+ }
+
+ if (SafeToAssume) {
+ FoundZeroExtend = true;
+ goto next_predecessor;
+ }
+ }
+
+ // Walk backward through predecessor to find last definition of Reg
+ for (const MachineInstr &Inst : llvm::reverse(*Pred)) {
+ // Check if this instruction defines Reg
+ for (const MachineOperand &MO : Inst.operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+
+ Register DefReg = MO.getReg();
+ if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg)) {
+ // Found a definition - check if it zeros upper bits
+ unsigned Opc = Inst.getOpcode();
+ switch (Opc) {
+ case X86::MOVZX32rm8:
+ case X86::MOVZX32rr8:
+ case X86::MOVZX32rm16:
+ case X86::MOVZX32rr16:
+ case X86::MOV32r0:
+ case X86::ADD32rr:
+ case X86::ADD32ri:
+ case X86::ADD32rm:
+ case X86::SUB32rr:
+ case X86::SUB32ri:
+ case X86::SUB32rm:
+ case X86::LEA32r:
+ FoundZeroExtend = true;
+ break;
+ case X86::XOR32rr:
+ if (Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg())
+ FoundZeroExtend = true;
+ break;
+ default:
+ // Found a definition that doesn't zero upper bits
+ return false;
+ }
+ // Found the definition in this predecessor
+ goto next_predecessor;
+ }
+
+ // Check for partial register updates
+ if (TRI->regsOverlap(DefReg, Reg)) {
+ return false;
+ }
+ }
+ }
+
+ next_predecessor:
+ // If we didn't find a zero-extending definition in this predecessor, fail
+ if (!FoundZeroExtend)
+ return false;
+ }
+
+ // All predecessors have zero-extending definitions
+ return true;
+}
+
+bool EliminateRedundantZeroExtend::tryEliminateRedundantZeroExtend(
+ MachineInstr &MI, MachineBasicBlock &MBB) const {
+ unsigned Opc = MI.getOpcode();
+
+ // Only handle MOVZX32rr8 for now (can extend to MOVZX32rr16 later)
+ if (Opc != X86::MOVZX32rr8)
+ return false;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ // Check if source is a sub-register of destination
+ // e.g., CL is sub-register of ECX
+ if (!TRI->isSubRegister(DstReg, SrcReg))
+ return false;
+
+ // Check if destination's upper bits are already zero
+ if (!hasZeroUpperBits(DstReg, MI, MBB))
+ return false;
+
+ // The MOVZX is redundant! Since SrcReg is part of DstReg and DstReg's
+ // upper bits are already zero, this instruction does nothing.
+ LLVM_DEBUG(dbgs() << "Eliminating redundant zero-extend: " << MI);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool EliminateRedundantZeroExtend::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ // Iterate through instructions - use a worklist to handle erasures
+ SmallVector<MachineInstr *, 4> ToErase;
+
+ for (MachineInstr &MI : MBB) {
+ if (tryEliminateRedundantZeroExtend(MI, MBB)) {
+ Changed = true;
+ // Note: MI is already erased in tryEliminateRedundantZeroExtend
+ break; // Restart iteration for this block
+ }
+ }
+ }
+
+ return Changed;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8dd6f3d97ccea..72835150e8277 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -558,6 +558,7 @@ void X86PassConfig::addPreEmitPass() {
if (getOptLevel() != CodeGenOptLevel::None) {
addPass(createX86FixupBWInsts());
+ addPass(createX86EliminateRedundantZeroExtend());
addPass(createX86PadShortFunctions());
addPass(createX86FixupLEAs());
addPass(createX86FixupInstTuning());
diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
new file mode 100644
index 0000000000000..2c9e46e043187
--- /dev/null
+++ b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s
+
+; Test that redundant MOVZX instructions are eliminated when the source
+; register is a sub-register of the destination and the destination's upper
+; bits are already known to be zero.
+
+; This is the original countholes test case from GitHub issue that demonstrates
+; the redundant movzbl %cl, %ecx in the loop
+define i32 @countholes(ptr %s) {
+; CHECK-LABEL: countholes:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzbl (%rdi), %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $48, %cl
+; CHECK-NEXT: jb .LBB0_3
+; CHECK-NEXT: # %bb.1: # %while.body.preheader
+; CHECK-NEXT: incq %rdi
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: leaq pre_table(%rip), %rdx
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_2: # %while.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addl $-48, %ecx
+; CHECK-NEXT: addl (%rdx,%rcx,4), %eax
+; CHECK-NEXT: movzbl (%rdi), %ecx
+; CHECK-NEXT: incq %rdi
+; CHECK-NEXT: cmpb $47, %cl
+; CHECK-NEXT: ja .LBB0_2
+; CHECK-NEXT: .LBB0_3: # %cleanup
+; CHECK-NEXT: retq
+entry:
+ %c.0 = load i8, ptr %s, align 1
+ %conv = zext i8 %c.0 to i32
+ %cmp = icmp ult i8 %c.0, 48
+ br i1 %cmp, label %cleanup, label %while.body.preheader
+
+while.body.preheader:
+ br label %while.body
+
+while.body:
+ %s.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %s, %while.body.preheader ]
+ %c.010 = phi i8 [ %c.1, %while.body ], [ %c.0, %while.body.preheader ]
+ %tot.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+ %conv3 = zext i8 %c.010 to i64
+ %sub = add nsw i64 %conv3, -48
+ %arrayidx = getelementptr inbounds [10 x i32], ptr @pre_table, i64 0, i64 %sub
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add i32 %0, %tot.09
+ %incdec.ptr = getelementptr inbounds i8, ptr %s.addr.011, i64 1
+ %c.1 = load i8, ptr %incdec.ptr, align 1
+ %cmp1 = icmp ult i8 %c.1, 48
+ br i1 %cmp1, label %cleanup.loopexit, label %while.body
+
+cleanup.loopexit:
+ br label %cleanup
+
+cleanup:
+ %retval.0 = phi i32 [ 0, %entry ], [ %add, %cleanup.loopexit ]
+ ret i32 %retval.0
+}
+
+ at pre_table = internal constant [10 x i32] [i32 1, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 2, i32 1], align 4
From c7b29d9daced40d7589b3c3682820b21d174b9a2 Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Thu, 2 Oct 2025 04:04:08 -0700
Subject: [PATCH 2/5] Fixing broken tests on Linux
---
.../CodeGen/X86/eliminate-redundant-zext.ll | 9 +-
llvm/test/CodeGen/X86/opt-pipeline.ll | 1 +
llvm/test/CodeGen/X86/pr38539.ll | 2 +-
llvm/test/CodeGen/X86/vector-compress.ll | 1025 ++++++++---------
4 files changed, 508 insertions(+), 529 deletions(-)
diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
index 2c9e46e043187..4399841d49876 100644
--- a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
+++ b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -5,7 +5,7 @@
; register is a sub-register of the destination and the destination's upper
; bits are already known to be zero.
-; This is the original countholes test case from GitHub issue that demonstrates
+; This is the original countholes test case from GitHub issue #160710 that demonstrates
; the redundant movzbl %cl, %ecx in the loop
define i32 @countholes(ptr %s) {
; CHECK-LABEL: countholes:
@@ -17,12 +17,11 @@ define i32 @countholes(ptr %s) {
; CHECK-NEXT: # %bb.1: # %while.body.preheader
; CHECK-NEXT: incq %rdi
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: leaq pre_table(%rip), %rdx
-; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .p2align 4{{$}}
; CHECK-NEXT: .LBB0_2: # %while.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: addl $-48, %ecx
-; CHECK-NEXT: addl (%rdx,%rcx,4), %eax
+; CHECK-NOT: movzbl %cl, %ecx
+; CHECK: addl {{.*}}, %eax
; CHECK-NEXT: movzbl (%rdi), %ecx
; CHECK-NEXT: incq %rdi
; CHECK-NEXT: cmpb $47, %cl
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 81390e59d0d0a..01385fb63d6e1 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -202,6 +202,7 @@
; CHECK-NEXT: X86 vzeroupper inserter
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: X86 Byte/Word Instruction Fixup
+; CHECK-NEXT: X86 Eliminate Redundant Zero Extension
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: X86 Atom pad short functions
; CHECK-NEXT: X86 LEA Fixup
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 412455384e937..b633c28a214b7 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -23,7 +23,7 @@ define void @f() nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $160, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzbl (%eax), %eax
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index ac932d51017ae..a9b637931fc9b 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -1094,25 +1094,26 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm1, %r13d
-; AVX2-NEXT: vmovd %xmm1, %esi
-; AVX2-NEXT: movl %esi, %eax
+; AVX2-NEXT: vpextrb $1, %xmm1, %r11d
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movzbl %al, %edx
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: subb %r11b, %al
+; AVX2-NEXT: vpextrb $2, %xmm1, %esi
+; AVX2-NEXT: subb %sil, %al
+; AVX2-NEXT: vpextrb $3, %xmm1, %r13d
; AVX2-NEXT: subb %r13b, %al
-; AVX2-NEXT: vpextrb $2, %xmm1, %edx
-; AVX2-NEXT: subb %dl, %al
-; AVX2-NEXT: vpextrb $3, %xmm1, %ebp
-; AVX2-NEXT: subb %bpl, %al
; AVX2-NEXT: vpextrb $4, %xmm1, %r12d
; AVX2-NEXT: subb %r12b, %al
; AVX2-NEXT: vpextrb $5, %xmm1, %r15d
; AVX2-NEXT: subb %r15b, %al
; AVX2-NEXT: vpextrb $6, %xmm1, %r14d
; AVX2-NEXT: subb %r14b, %al
-; AVX2-NEXT: vpextrb $7, %xmm1, %ebx
+; AVX2-NEXT: vpextrb $7, %xmm1, %ebp
+; AVX2-NEXT: subb %bpl, %al
+; AVX2-NEXT: vpextrb $8, %xmm1, %ebx
; AVX2-NEXT: subb %bl, %al
-; AVX2-NEXT: vpextrb $8, %xmm1, %r11d
-; AVX2-NEXT: subb %r11b, %al
; AVX2-NEXT: vpextrb $9, %xmm1, %r10d
; AVX2-NEXT: subb %r10b, %al
; AVX2-NEXT: vpextrb $10, %xmm1, %r9d
@@ -1122,94 +1123,108 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: vpextrb $12, %xmm1, %edi
; AVX2-NEXT: subb %dil, %al
; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: subb %cl, %al
; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: subb %cl, %al
; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
+; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: subb %cl, %al
; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzbl -40(%rsp,%rax), %eax
; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rsi)
-; AVX2-NEXT: andl $1, %r13d
-; AVX2-NEXT: addq %rsi, %r13
-; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%r13)
; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %r13, %rdx
-; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rdx)
-; AVX2-NEXT: andl $1, %ebp
-; AVX2-NEXT: addq %rdx, %rbp
-; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rbp)
-; AVX2-NEXT: andl $1, %r12d
-; AVX2-NEXT: addq %rbp, %r12
-; AVX2-NEXT: andl $1, %r15d
-; AVX2-NEXT: addq %r12, %r15
-; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12
-; AVX2-NEXT: andl $15, %r12d
-; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%r12)
-; AVX2-NEXT: andl $1, %r14d
-; AVX2-NEXT: addq %r15, %r14
-; AVX2-NEXT: # kill: def $r15d killed $r15d killed $r15 def $r15
-; AVX2-NEXT: andl $15, %r15d
-; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%r15)
-; AVX2-NEXT: andl $1, %ebx
-; AVX2-NEXT: addq %r14, %rbx
-; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14
-; AVX2-NEXT: andl $15, %r14d
-; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%r14)
-; AVX2-NEXT: andl $1, %r11d
-; AVX2-NEXT: addq %rbx, %r11
-; AVX2-NEXT: # kill: def $ebx killed $ebx killed $rbx def $rbx
-; AVX2-NEXT: andl $15, %ebx
-; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rbx)
-; AVX2-NEXT: andl $1, %r10d
-; AVX2-NEXT: addq %r11, %r10
-; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11
-; AVX2-NEXT: andl $15, %r11d
-; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%r11)
-; AVX2-NEXT: andl $1, %r9d
-; AVX2-NEXT: addq %r10, %r9
-; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
-; AVX2-NEXT: andl $15, %r10d
-; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%r10)
-; AVX2-NEXT: andl $1, %r8d
-; AVX2-NEXT: addq %r9, %r8
-; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9
-; AVX2-NEXT: andl $15, %r9d
-; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%r9)
-; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: addq %r8, %rdi
-; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8
-; AVX2-NEXT: andl $15, %r8d
-; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%r8)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: addq %rdi, %rsi
-; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
-; AVX2-NEXT: andl $15, %edi
-; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rdi)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rdx)
+; AVX2-NEXT: movzbl %r11b, %eax
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rsi, %rax
-; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
-; AVX2-NEXT: andl $15, %esi
-; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rsi)
+; AVX2-NEXT: addq %rdx, %rax
+; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT: movzbl %sil, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT: movzbl %r13b, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT: movzbl %r12b, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: movzbl %r15b, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT: movzbl %r14b, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: cmpq $15, %rcx
-; AVX2-NEXT: movl $15, %eax
-; AVX2-NEXT: cmovbq %rcx, %rax
-; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
-; AVX2-NEXT: movb %cl, -40(%rsp,%rax)
+; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT: movzbl %bpl, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT: movzbl %bl, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT: movzbl %r10b, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT: movzbl %r9b, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT: movzbl %r8b, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT: movzbl %dil, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rcx)
+; AVX2-NEXT: cmpq $15, %rax
+; AVX2-NEXT: movl $15, %ecx
+; AVX2-NEXT: cmovbq %rax, %rcx
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; AVX2-NEXT: movb %al, -40(%rsp,%rcx)
; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
@@ -1790,137 +1805,140 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $96, %rsp
-; AVX2-NEXT: movl %r9d, %r11d
-; AVX2-NEXT: movl %r8d, %r10d
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: movl %edx, %r8d
-; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8
+; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %ecx, %r13d
+; AVX2-NEXT: movl %edx, %r15d
+; AVX2-NEXT: movl %esi, %ebx
; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: movzbl 360(%rbp), %eax
-; AVX2-NEXT: movzbl 352(%rbp), %ecx
+; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl 360(%rbp), %eax
+; AVX2-NEXT: movl 352(%rbp), %ecx
; AVX2-NEXT: vmovd %ecx, %xmm4
; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 368(%rbp), %eax
+; AVX2-NEXT: movl 368(%rbp), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 376(%rbp), %eax
+; AVX2-NEXT: movl 376(%rbp), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 384(%rbp), %eax
+; AVX2-NEXT: movl 384(%rbp), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 392(%rbp), %eax
+; AVX2-NEXT: movl 392(%rbp), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 400(%rbp), %eax
+; AVX2-NEXT: movl 400(%rbp), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 408(%rbp), %eax
+; AVX2-NEXT: movl 408(%rbp), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 416(%rbp), %eax
+; AVX2-NEXT: movl 416(%rbp), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 424(%rbp), %eax
+; AVX2-NEXT: movl 424(%rbp), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 432(%rbp), %eax
+; AVX2-NEXT: movl 432(%rbp), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 440(%rbp), %eax
+; AVX2-NEXT: movl 440(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 448(%rbp), %eax
+; AVX2-NEXT: movl 448(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 456(%rbp), %eax
+; AVX2-NEXT: movl 456(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 464(%rbp), %eax
+; AVX2-NEXT: movl 464(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 472(%rbp), %eax
+; AVX2-NEXT: movl 472(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 224(%rbp), %eax
+; AVX2-NEXT: movl 224(%rbp), %eax
; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: movzbl 232(%rbp), %eax
+; AVX2-NEXT: movl 232(%rbp), %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 240(%rbp), %eax
+; AVX2-NEXT: movl 240(%rbp), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 248(%rbp), %eax
+; AVX2-NEXT: movl 248(%rbp), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 256(%rbp), %eax
+; AVX2-NEXT: movl 256(%rbp), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 264(%rbp), %eax
+; AVX2-NEXT: movl 264(%rbp), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 272(%rbp), %eax
+; AVX2-NEXT: movl 272(%rbp), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 280(%rbp), %eax
+; AVX2-NEXT: movl 280(%rbp), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 288(%rbp), %eax
+; AVX2-NEXT: movl 288(%rbp), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 296(%rbp), %eax
+; AVX2-NEXT: movl 296(%rbp), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 304(%rbp), %eax
+; AVX2-NEXT: movl 304(%rbp), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 312(%rbp), %eax
+; AVX2-NEXT: movl 312(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 320(%rbp), %eax
+; AVX2-NEXT: movl 320(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 328(%rbp), %eax
+; AVX2-NEXT: movl 328(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 336(%rbp), %eax
+; AVX2-NEXT: movl 336(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 344(%rbp), %eax
+; AVX2-NEXT: movl 344(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT: vmovd %edi, %xmm5
-; AVX2-NEXT: vpinsrb $1, %esi, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $2, %edx, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $3, %r9d, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $4, %r10d, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $5, %r11d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 16(%rbp), %ebx
-; AVX2-NEXT: vpinsrb $6, %ebx, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 24(%rbp), %r14d
-; AVX2-NEXT: vpinsrb $7, %r14d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 32(%rbp), %r15d
-; AVX2-NEXT: vpinsrb $8, %r15d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 40(%rbp), %r12d
-; AVX2-NEXT: vpinsrb $9, %r12d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 48(%rbp), %r13d
-; AVX2-NEXT: vpinsrb $10, %r13d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 56(%rbp), %eax
+; AVX2-NEXT: movl 96(%rbp), %eax
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: movl 104(%rbp), %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 112(%rbp), %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 120(%rbp), %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 128(%rbp), %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 136(%rbp), %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 144(%rbp), %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 152(%rbp), %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 160(%rbp), %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 168(%rbp), %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 176(%rbp), %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
+; AVX2-NEXT: movl 184(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 64(%rbp), %eax
+; AVX2-NEXT: movl 192(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 72(%rbp), %eax
+; AVX2-NEXT: movl 200(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 80(%rbp), %eax
+; AVX2-NEXT: movl 208(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 88(%rbp), %eax
+; AVX2-NEXT: movl 216(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 96(%rbp), %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: movzbl 104(%rbp), %eax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 112(%rbp), %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 120(%rbp), %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 128(%rbp), %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 136(%rbp), %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 144(%rbp), %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 152(%rbp), %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 160(%rbp), %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 168(%rbp), %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 176(%rbp), %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 184(%rbp), %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 192(%rbp), %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 200(%rbp), %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 208(%rbp), %eax
+; AVX2-NEXT: vmovd %edi, %xmm6
+; AVX2-NEXT: vpinsrb $1, %esi, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $2, %edx, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $3, %r13d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $5, %r9d, %xmm6, %xmm6
+; AVX2-NEXT: movl 16(%rbp), %esi
+; AVX2-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6
+; AVX2-NEXT: movl 24(%rbp), %edi
+; AVX2-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6
+; AVX2-NEXT: movl 32(%rbp), %r8d
+; AVX2-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6
+; AVX2-NEXT: movl 40(%rbp), %r9d
+; AVX2-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6
+; AVX2-NEXT: movl 48(%rbp), %r10d
+; AVX2-NEXT: vpinsrb $10, %r10d, %xmm6, %xmm6
+; AVX2-NEXT: movl 56(%rbp), %r11d
+; AVX2-NEXT: vpinsrb $11, %r11d, %xmm6, %xmm6
+; AVX2-NEXT: movl 64(%rbp), %r14d
+; AVX2-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6
+; AVX2-NEXT: movl 72(%rbp), %r12d
+; AVX2-NEXT: vpinsrb $13, %r12d, %xmm6, %xmm6
+; AVX2-NEXT: movl 80(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 216(%rbp), %eax
+; AVX2-NEXT: movl 88(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm4
@@ -1962,435 +1980,379 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: vmovaps %ymm2, (%rsp)
; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: movzbl (%rsp,%rax), %edx
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
+; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp)
-; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: andl $1, %ebx
+; AVX2-NEXT: addq %rax, %rbx
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rbx)
+; AVX2-NEXT: andl $1, %r15d
+; AVX2-NEXT: addq %rbx, %r15
+; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r15)
+; AVX2-NEXT: andl $1, %r13d
+; AVX2-NEXT: addq %r15, %r13
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r13)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %r13, %rcx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: addq %rdi, %rsi
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rsi)
+; AVX2-NEXT: addq %rax, %rsi
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: addq %rsi, %rdi
+; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
+; AVX2-NEXT: andl $63, %esi
+; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi)
; AVX2-NEXT: andl $1, %r8d
-; AVX2-NEXT: addq %rsi, %r8
-; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r8)
+; AVX2-NEXT: addq %rdi, %r8
+; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX2-NEXT: andl $63, %edi
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi)
; AVX2-NEXT: andl $1, %r9d
; AVX2-NEXT: addq %r8, %r9
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r9)
+; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8
+; AVX2-NEXT: andl $63, %r8d
+; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8)
; AVX2-NEXT: andl $1, %r10d
; AVX2-NEXT: addq %r9, %r10
-; AVX2-NEXT: movl %r10d, %eax
-; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9
+; AVX2-NEXT: andl $63, %r9d
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%r9)
; AVX2-NEXT: andl $1, %r11d
; AVX2-NEXT: addq %r10, %r11
-; AVX2-NEXT: movzbl %bl, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %r11, %rax
+; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX2-NEXT: andl $63, %r10d
+; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10)
+; AVX2-NEXT: andl $1, %r14d
+; AVX2-NEXT: addq %r11, %r14
; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11
; AVX2-NEXT: andl $63, %r11d
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%r11)
-; AVX2-NEXT: movzbl %r14b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl %r15b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl %r12b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl %r13b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 56(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 64(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 72(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 80(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%r11)
+; AVX2-NEXT: andl $1, %r12d
+; AVX2-NEXT: addq %r14, %r12
+; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14
+; AVX2-NEXT: andl $63, %r14d
+; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%r14)
+; AVX2-NEXT: movl 80(%rbp), %eax
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 88(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: addq %r12, %rax
+; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12
+; AVX2-NEXT: andl $63, %r12d
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r12)
+; AVX2-NEXT: movl 88(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 96(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: movl 96(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 104(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 104(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 112(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 120(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 112(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 120(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 128(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 136(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 128(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 136(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 144(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 152(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 144(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 152(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 160(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 168(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 160(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 168(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 176(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 184(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 176(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 184(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 192(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 200(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 192(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 200(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 208(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 216(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 208(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 216(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 224(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 232(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 224(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 232(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 240(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 248(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 240(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 248(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 256(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 264(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 256(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 264(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 272(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 280(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 272(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 280(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 288(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 296(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 288(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 296(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 304(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 312(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 304(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 312(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 320(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 328(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 320(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 328(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 336(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 344(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 336(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 344(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 352(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: movl 352(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 360(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 360(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 368(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 376(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 368(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 376(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 384(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 392(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 384(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 392(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 400(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 408(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 400(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 408(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 416(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 424(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 416(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 424(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 432(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 440(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 432(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 440(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 448(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 456(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 448(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 456(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 464(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 472(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 464(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 472(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
; AVX2-NEXT: vpextrb $15, %xmm0, %eax
; AVX2-NEXT: cmpq $64, %rcx
-; AVX2-NEXT: cmovbl %edx, %eax
+; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; AVX2-NEXT: cmpq $63, %rcx
-; AVX2-NEXT: movl $63, %edx
-; AVX2-NEXT: cmovbq %rcx, %rdx
-; AVX2-NEXT: movb %al, (%rsp,%rdx)
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: movl $63, %ecx
+; AVX2-NEXT: cmovbq %rdx, %rcx
+; AVX2-NEXT: movb %al, (%rsp,%rcx)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: leaq -40(%rbp), %rsp
@@ -4499,8 +4461,9 @@ define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x
; AVX2-NEXT: cmovbq %r11, %rax
; AVX2-NEXT: movl %eax, %eax
; AVX2-NEXT: movq %rbx, (%rsp,%rax,8)
-; AVX2-NEXT: vmovaps (%rsp), %ymm0
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
; AVX2-NEXT: leaq -8(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
@@ -4511,18 +4474,20 @@ define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_compress_knownbits_zext_v8i16_8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
; AVX512VL-NEXT: retq
%xvec = zext <8 x i16> %vec to <8 x i64> ; 0 -> 65535
%xpassthru = and <8 x i64> %passthru, splat (i64 3) ; 0 -> 3
@@ -4603,8 +4568,18 @@ define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i
; AVX2-NEXT: cmovbq %r11, %rax
; AVX2-NEXT: movl %eax, %eax
; AVX2-NEXT: movq %rbx, (%rsp,%rax,8)
-; AVX2-NEXT: vmovaps (%rsp), %ymm0
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vmovdqa (%rsp), %ymm0
+; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vpsllq $48, %ymm0, %ymm2
+; AVX2-NEXT: vpsrad $31, %ymm2, %ymm2
+; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT: vpsllq $48, %ymm1, %ymm2
+; AVX2-NEXT: vpsrad $31, %ymm2, %ymm2
+; AVX2-NEXT: vpslld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT: leaq -8(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
@@ -4615,18 +4590,22 @@ define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm1
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vpsllq $48, %zmm1, %zmm0
+; AVX512F-NEXT: vpsraq $48, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_compress_knownbits_sext_v8i16_8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
-; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm1
-; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vpsllq $48, %zmm1, %zmm0
+; AVX512VL-NEXT: vpsraq $48, %zmm0, %zmm0
; AVX512VL-NEXT: retq
%xvec = sext <8 x i16> %vec to <8 x i64> ; sign extend vec
%xpassthru = and <8 x i64> %passthru, splat(i64 3)
>From ae48367bdc111c0951107418831707b6bb59c15f Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Thu, 2 Oct 2025 10:50:32 -0700
Subject: [PATCH 3/5] Address review feedback: tighten eliminate-redundant-zext.ll checks
---
llvm/test/CodeGen/X86/eliminate-redundant-zext.ll | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
index 4399841d49876..294a6e7f780e3 100644
--- a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
+++ b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -17,11 +17,10 @@ define i32 @countholes(ptr %s) {
; CHECK-NEXT: # %bb.1: # %while.body.preheader
; CHECK-NEXT: incq %rdi
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: .p2align 4{{$}}
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # %while.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NOT: movzbl %cl, %ecx
-; CHECK: addl {{.*}}, %eax
+; CHECK-NEXT: addl pre_table-192(,%rcx,4), %eax
; CHECK-NEXT: movzbl (%rdi), %ecx
; CHECK-NEXT: incq %rdi
; CHECK-NEXT: cmpb $47, %cl
>From 7d9e8ec6ad93b3fad8d6ed0f3c7b17ac5314b889 Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Thu, 2 Oct 2025 15:31:11 -0700
Subject: [PATCH 4/5] Update existing tests affected by the new pass
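The hunks below drop FileCheck lines for a `movzbl` that re-zero-extended a byte
whose containing 32-bit register was already zero-extended, so the pass now removes
the instruction and the old expectations no longer match. A representative sketch,
reconstructed from the atomic_shl1_xor_8_gpr_valz hunk (the surrounding instructions
and the pre-loop zero-extension of EAX are assumed from that test's context):
```
lock cmpxchgb %cl, (%rdi)   # only AL can be rewritten; bits 8-31 of EAX stay zero
jne .LBB3_1
# %bb.2:                    # %atomicrmw.end
movzbl %al, %eax            # previously expected here; now elided as redundant
testl %eax, %edx
sete %al
```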
---
llvm/test/CodeGen/X86/atomic-rm-bit-test.ll | 10 ----------
llvm/test/CodeGen/X86/ctlz.ll | 2 --
llvm/test/CodeGen/X86/isel-select-cmov.ll | 4 ----
llvm/test/CodeGen/X86/isel-udiv.ll | 1 -
llvm/test/CodeGen/X86/isel-urem.ll | 1 -
llvm/test/CodeGen/X86/popcnt.ll | 2 --
llvm/test/CodeGen/X86/sttni.ll | 5 -----
7 files changed, 25 deletions(-)
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index b4d40fee01e41..a283a002d9818 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -177,7 +177,6 @@ define zeroext i8 @atomic_shl1_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounwind {
; X86-NEXT: lock cmpxchgb %cl, (%esi)
; X86-NEXT: jne .LBB3_1
; X86-NEXT: # %bb.2: # %atomicrmw.end
-; X86-NEXT: movzbl %al, %eax
; X86-NEXT: testl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: popl %esi
@@ -198,7 +197,6 @@ define zeroext i8 @atomic_shl1_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounwind {
; X64-NEXT: lock cmpxchgb %cl, (%rdi)
; X64-NEXT: jne .LBB3_1
; X64-NEXT: # %bb.2: # %atomicrmw.end
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: testl %eax, %edx
; X64-NEXT: sete %al
; X64-NEXT: retq
@@ -233,7 +231,6 @@ define zeroext i8 @atomic_shl1_mask0_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounw
; X86-NEXT: lock cmpxchgb %cl, (%esi)
; X86-NEXT: jne .LBB4_1
; X86-NEXT: # %bb.2: # %atomicrmw.end
-; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movzbl %dl, %ecx
; X86-NEXT: btl %ecx, %eax
; X86-NEXT: setae %al
@@ -255,7 +252,6 @@ define zeroext i8 @atomic_shl1_mask0_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounw
; X64-NEXT: lock cmpxchgb %cl, (%rdi)
; X64-NEXT: jne .LBB4_1
; X64-NEXT: # %bb.2: # %atomicrmw.end
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: movzbl %sil, %ecx
; X64-NEXT: btl %ecx, %eax
; X64-NEXT: setae %al
@@ -291,7 +287,6 @@ define zeroext i8 @atomic_shl1_mask01_xor_8_gpr_valz(ptr %v, i8 zeroext %c) noun
; X86-NEXT: lock cmpxchgb %cl, (%edx)
; X86-NEXT: jne .LBB5_1
; X86-NEXT: # %bb.2: # %atomicrmw.end
-; X86-NEXT: movzbl %al, %eax
; X86-NEXT: testl %eax, %ebx
; X86-NEXT: sete %al
; X86-NEXT: popl %ebx
@@ -313,7 +308,6 @@ define zeroext i8 @atomic_shl1_mask01_xor_8_gpr_valz(ptr %v, i8 zeroext %c) noun
; X64-NEXT: lock cmpxchgb %cl, (%rdi)
; X64-NEXT: jne .LBB5_1
; X64-NEXT: # %bb.2: # %atomicrmw.end
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: testl %eax, %edx
; X64-NEXT: sete %al
; X64-NEXT: retq
@@ -349,7 +343,6 @@ define zeroext i8 @atomic_shl1_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounwind {
; X86-NEXT: lock cmpxchgb %ch, (%edx)
; X86-NEXT: jne .LBB6_1
; X86-NEXT: # %bb.2: # %atomicrmw.end
-; X86-NEXT: movzbl %al, %eax
; X86-NEXT: testl %eax, %ebx
; X86-NEXT: je .LBB6_3
; X86-NEXT: # %bb.4: # %if.then
@@ -378,7 +371,6 @@ define zeroext i8 @atomic_shl1_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounwind {
; X64-NEXT: lock cmpxchgb %r8b, (%rdi)
; X64-NEXT: jne .LBB6_1
; X64-NEXT: # %bb.2: # %atomicrmw.end
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: testl %eax, %edx
; X64-NEXT: je .LBB6_3
; X64-NEXT: # %bb.4: # %if.then
@@ -512,7 +504,6 @@ define zeroext i8 @atomic_shl1_mask01_and_8_gpr_brnz(ptr %v, i8 zeroext %c) noun
; X86-NEXT: testl %ecx, %ebx
; X86-NEXT: je .LBB8_3
; X86-NEXT: # %bb.4: # %if.then
-; X86-NEXT: movzbl %ah, %eax
; X86-NEXT: movzbl (%edx,%eax), %eax
; X86-NEXT: popl %ebx
; X86-NEXT: retl
@@ -538,7 +529,6 @@ define zeroext i8 @atomic_shl1_mask01_and_8_gpr_brnz(ptr %v, i8 zeroext %c) noun
; X64-NEXT: lock cmpxchgb %r8b, (%rdi)
; X64-NEXT: jne .LBB8_1
; X64-NEXT: # %bb.2: # %atomicrmw.end
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: testl %eax, %edx
; X64-NEXT: je .LBB8_3
; X64-NEXT: # %bb.4: # %if.then
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index 1267fe9033454..a3d28a7fcba24 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -224,7 +224,6 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
; X86-NOCMOV-NEXT: testb %al, %al
; X86-NOCMOV-NEXT: je .LBB4_1
; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
-; X86-NOCMOV-NEXT: movzbl %al, %eax
; X86-NOCMOV-NEXT: bsrl %eax, %eax
; X86-NOCMOV-NEXT: xorl $7, %eax
; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax
@@ -961,7 +960,6 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
; X86-NOCMOV-NEXT: testb %al, %al
; X86-NOCMOV-NEXT: je .LBB16_1
; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
-; X86-NOCMOV-NEXT: movzbl %al, %eax
; X86-NOCMOV-NEXT: bsrl %eax, %eax
; X86-NOCMOV-NEXT: xorl $7, %eax
; X86-NOCMOV-NEXT: xorb $7, %al
diff --git a/llvm/test/CodeGen/X86/isel-select-cmov.ll b/llvm/test/CodeGen/X86/isel-select-cmov.ll
index d013ad2c7fbff..783db3487e2bd 100644
--- a/llvm/test/CodeGen/X86/isel-select-cmov.ll
+++ b/llvm/test/CodeGen/X86/isel-select-cmov.ll
@@ -73,11 +73,9 @@ define zeroext i8 @select_cmov_i8(i1 zeroext %cond, i8 zeroext %a, i8 zeroext %b
; FAST-X86-NEXT: jne LBB0_1
; FAST-X86-NEXT: ## %bb.2:
; FAST-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; FAST-X86-NEXT: movzbl %al, %eax
; FAST-X86-NEXT: retl
; FAST-X86-NEXT: LBB0_1:
; FAST-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; FAST-X86-NEXT: movzbl %al, %eax
; FAST-X86-NEXT: retl
;
; FAST-X86-CMOV-LABEL: select_cmov_i8:
@@ -86,11 +84,9 @@ define zeroext i8 @select_cmov_i8(i1 zeroext %cond, i8 zeroext %a, i8 zeroext %b
; FAST-X86-CMOV-NEXT: jne LBB0_1
; FAST-X86-CMOV-NEXT: ## %bb.2:
; FAST-X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; FAST-X86-CMOV-NEXT: movzbl %al, %eax
; FAST-X86-CMOV-NEXT: retl
; FAST-X86-CMOV-NEXT: LBB0_1:
; FAST-X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; FAST-X86-CMOV-NEXT: movzbl %al, %eax
; FAST-X86-CMOV-NEXT: retl
;
; GISEL-X86-LABEL: select_cmov_i8:
diff --git a/llvm/test/CodeGen/X86/isel-udiv.ll b/llvm/test/CodeGen/X86/isel-udiv.ll
index b123b3c7780fa..f96a12c2fafd0 100644
--- a/llvm/test/CodeGen/X86/isel-udiv.ll
+++ b/llvm/test/CodeGen/X86/isel-udiv.ll
@@ -22,7 +22,6 @@ define i8 @test_udiv_i8(i8 %arg1, i8 %arg2) nounwind {
; GISEL-X86-LABEL: test_udiv_i8:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; GISEL-X86-NEXT: movzbl %al, %eax
; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; GISEL-X86-NEXT: divb %cl
; GISEL-X86-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/isel-urem.ll b/llvm/test/CodeGen/X86/isel-urem.ll
index 386f08151ad9c..5dd901fe8daa6 100644
--- a/llvm/test/CodeGen/X86/isel-urem.ll
+++ b/llvm/test/CodeGen/X86/isel-urem.ll
@@ -49,7 +49,6 @@ define i8 @test_urem_i8(i8 %arg1, i8 %arg2) nounwind {
; GISEL-X86-LABEL: test_urem_i8:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; GISEL-X86-NEXT: movzbl %al, %eax
; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; GISEL-X86-NEXT: divb %cl
; GISEL-X86-NEXT: movb %ah, %al
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index 3004b8b72fcc5..cd5edffd8ccda 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -67,7 +67,6 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrl $8, %eax
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl %al, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -1840,7 +1839,6 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrl $8, %eax
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-BASE-LABEL: popcount_i16_zext:
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 39cbee54737c3..b0c92831124bf 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -89,7 +89,6 @@ define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs,
; X64-NEXT: jne .LBB2_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB2_2: # %compare
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -222,7 +221,6 @@ define i32 @pcmpestri_mem_diff_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
; X64-NEXT: jne .LBB5_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB5_2: # %compare
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
@@ -552,7 +550,6 @@ define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
; X86-NEXT: jne .LBB14_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
; X86-NEXT: .LBB14_2: # %compare
; X86-NEXT: pushl %ebp
@@ -577,7 +574,6 @@ define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
; X64-NEXT: jne .LBB14_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB14_2: # %compare
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -690,7 +686,6 @@ define i32 @pcmpistri_mem_diff_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
; X64-NEXT: jne .LBB17_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB17_2: # %compare
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
>From 0d9c3131428f6b7c5dfdc185d97ed60dd3ddcadd Mon Sep 17 00:00:00 2001
From: Francisco Geiman Thiesen <franciscogthiesen at gmail.com>
Date: Thu, 2 Oct 2025 20:22:17 -0700
Subject: [PATCH 5/5] Fix the two remaining failing tests (pr38539.ll, vector-compress.ll)
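The two remaining failures were pr38539.ll and vector-compress.ll; their checks are
updated to the output produced with the new pass enabled. In pr38539.ll the relevant
hunk drops a `movzbl` of a register that a byte load had already zero-extended; a
trimmed sketch of that hunk (spill line omitted):
```
movzbl (%eax), %eax   # byte load clears bits 8-31 of EAX
movzbl (%eax), %ecx
movzbl %al, %eax      # removed: upper bits of EAX are already known zero
divb %cl
```
The vector-compress.ll AVX2/AVX512 check bodies are updated wholesale to the new
codegen.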
---
llvm/test/CodeGen/X86/pr38539.ll | 3 +-
llvm/test/CodeGen/X86/vector-compress.ll | 1027 +++++++++++-----------
2 files changed, 524 insertions(+), 506 deletions(-)
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index b633c28a214b7..147abcdbff0b9 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -23,12 +23,11 @@ define void @f() nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $160, %esp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzbl (%eax), %eax
; X86-NEXT: movzbl (%eax), %ecx
-; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: divb %cl
; X86-NEXT: movl %edi, %eax
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index a9b637931fc9b..53e6e49268789 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -1094,26 +1094,25 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm1, %r11d
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: movzbl %al, %edx
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: vpextrb $1, %xmm1, %r13d
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: movl %esi, %eax
; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: subb %r11b, %al
-; AVX2-NEXT: vpextrb $2, %xmm1, %esi
-; AVX2-NEXT: subb %sil, %al
-; AVX2-NEXT: vpextrb $3, %xmm1, %r13d
; AVX2-NEXT: subb %r13b, %al
+; AVX2-NEXT: vpextrb $2, %xmm1, %edx
+; AVX2-NEXT: subb %dl, %al
+; AVX2-NEXT: vpextrb $3, %xmm1, %ebp
+; AVX2-NEXT: subb %bpl, %al
; AVX2-NEXT: vpextrb $4, %xmm1, %r12d
; AVX2-NEXT: subb %r12b, %al
; AVX2-NEXT: vpextrb $5, %xmm1, %r15d
; AVX2-NEXT: subb %r15b, %al
; AVX2-NEXT: vpextrb $6, %xmm1, %r14d
; AVX2-NEXT: subb %r14b, %al
-; AVX2-NEXT: vpextrb $7, %xmm1, %ebp
-; AVX2-NEXT: subb %bpl, %al
-; AVX2-NEXT: vpextrb $8, %xmm1, %ebx
+; AVX2-NEXT: vpextrb $7, %xmm1, %ebx
; AVX2-NEXT: subb %bl, %al
+; AVX2-NEXT: vpextrb $8, %xmm1, %r11d
+; AVX2-NEXT: subb %r11b, %al
; AVX2-NEXT: vpextrb $9, %xmm1, %r10d
; AVX2-NEXT: subb %r10b, %al
; AVX2-NEXT: vpextrb $10, %xmm1, %r9d
@@ -1123,108 +1122,94 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: vpextrb $12, %xmm1, %edi
; AVX2-NEXT: subb %dil, %al
; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: subb %cl, %al
; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: subb %cl, %al
; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: subb %cl, %al
; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzbl -40(%rsp,%rax), %eax
; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rsi)
+; AVX2-NEXT: andl $1, %r13d
+; AVX2-NEXT: addq %rsi, %r13
+; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%r13)
; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rdx)
-; AVX2-NEXT: movzbl %r11b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %sil, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %r13b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %r12b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: movzbl %r15b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %r14b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %bpl, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %bl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %r10b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %r9b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %r8b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %dil, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX2-NEXT: addq %r13, %rdx
+; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rdx)
+; AVX2-NEXT: andl $1, %ebp
+; AVX2-NEXT: addq %rdx, %rbp
+; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rbp)
+; AVX2-NEXT: andl $1, %r12d
+; AVX2-NEXT: addq %rbp, %r12
+; AVX2-NEXT: andl $1, %r15d
+; AVX2-NEXT: addq %r12, %r15
+; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12
+; AVX2-NEXT: andl $15, %r12d
+; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%r12)
+; AVX2-NEXT: andl $1, %r14d
+; AVX2-NEXT: addq %r15, %r14
+; AVX2-NEXT: # kill: def $r15d killed $r15d killed $r15 def $r15
+; AVX2-NEXT: andl $15, %r15d
+; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%r15)
+; AVX2-NEXT: andl $1, %ebx
+; AVX2-NEXT: addq %r14, %rbx
+; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14
+; AVX2-NEXT: andl $15, %r14d
+; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%r14)
+; AVX2-NEXT: andl $1, %r11d
+; AVX2-NEXT: addq %rbx, %r11
+; AVX2-NEXT: # kill: def $ebx killed $ebx killed $rbx def $rbx
+; AVX2-NEXT: andl $15, %ebx
+; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rbx)
+; AVX2-NEXT: andl $1, %r10d
+; AVX2-NEXT: addq %r11, %r10
+; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11
+; AVX2-NEXT: andl $15, %r11d
+; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%r11)
+; AVX2-NEXT: andl $1, %r9d
+; AVX2-NEXT: addq %r10, %r9
+; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX2-NEXT: andl $15, %r10d
+; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%r10)
+; AVX2-NEXT: andl $1, %r8d
+; AVX2-NEXT: addq %r9, %r8
+; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9
+; AVX2-NEXT: andl $15, %r9d
+; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%r9)
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: addq %r8, %rdi
+; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8
+; AVX2-NEXT: andl $15, %r8d
+; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%r8)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: addq %rdi, %rsi
+; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX2-NEXT: andl $15, %edi
+; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rdi)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; AVX2-NEXT: addq %rsi, %rax
+; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
+; AVX2-NEXT: andl $15, %esi
+; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rsi)
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: cmpq $15, %rax
-; AVX2-NEXT: movl $15, %ecx
-; AVX2-NEXT: cmovbq %rax, %rcx
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
-; AVX2-NEXT: movb %al, -40(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT: cmpq $15, %rcx
+; AVX2-NEXT: movl $15, %eax
+; AVX2-NEXT: cmovbq %rcx, %rax
+; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
+; AVX2-NEXT: movb %cl, -40(%rsp,%rax)
; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
@@ -1805,140 +1790,137 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $128, %rsp
-; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8
-; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl %ecx, %r13d
-; AVX2-NEXT: movl %edx, %r15d
-; AVX2-NEXT: movl %esi, %ebx
+; AVX2-NEXT: subq $96, %rsp
+; AVX2-NEXT: movl %r9d, %r11d
+; AVX2-NEXT: movl %r8d, %r10d
+; AVX2-NEXT: movl %ecx, %r9d
+; AVX2-NEXT: movl %edx, %r8d
+; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl 360(%rbp), %eax
-; AVX2-NEXT: movl 352(%rbp), %ecx
+; AVX2-NEXT: movzbl 360(%rbp), %eax
+; AVX2-NEXT: movzbl 352(%rbp), %ecx
; AVX2-NEXT: vmovd %ecx, %xmm4
; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 368(%rbp), %eax
+; AVX2-NEXT: movzbl 368(%rbp), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 376(%rbp), %eax
+; AVX2-NEXT: movzbl 376(%rbp), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 384(%rbp), %eax
+; AVX2-NEXT: movzbl 384(%rbp), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 392(%rbp), %eax
+; AVX2-NEXT: movzbl 392(%rbp), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 400(%rbp), %eax
+; AVX2-NEXT: movzbl 400(%rbp), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 408(%rbp), %eax
+; AVX2-NEXT: movzbl 408(%rbp), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 416(%rbp), %eax
+; AVX2-NEXT: movzbl 416(%rbp), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 424(%rbp), %eax
+; AVX2-NEXT: movzbl 424(%rbp), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 432(%rbp), %eax
+; AVX2-NEXT: movzbl 432(%rbp), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 440(%rbp), %eax
+; AVX2-NEXT: movzbl 440(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 448(%rbp), %eax
+; AVX2-NEXT: movzbl 448(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 456(%rbp), %eax
+; AVX2-NEXT: movzbl 456(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 464(%rbp), %eax
+; AVX2-NEXT: movzbl 464(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 472(%rbp), %eax
+; AVX2-NEXT: movzbl 472(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 224(%rbp), %eax
+; AVX2-NEXT: movzbl 224(%rbp), %eax
; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: movl 232(%rbp), %eax
+; AVX2-NEXT: movzbl 232(%rbp), %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 240(%rbp), %eax
+; AVX2-NEXT: movzbl 240(%rbp), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 248(%rbp), %eax
+; AVX2-NEXT: movzbl 248(%rbp), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 256(%rbp), %eax
+; AVX2-NEXT: movzbl 256(%rbp), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 264(%rbp), %eax
+; AVX2-NEXT: movzbl 264(%rbp), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 272(%rbp), %eax
+; AVX2-NEXT: movzbl 272(%rbp), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 280(%rbp), %eax
+; AVX2-NEXT: movzbl 280(%rbp), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 288(%rbp), %eax
+; AVX2-NEXT: movzbl 288(%rbp), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 296(%rbp), %eax
+; AVX2-NEXT: movzbl 296(%rbp), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 304(%rbp), %eax
+; AVX2-NEXT: movzbl 304(%rbp), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 312(%rbp), %eax
+; AVX2-NEXT: movzbl 312(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 320(%rbp), %eax
+; AVX2-NEXT: movzbl 320(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 328(%rbp), %eax
+; AVX2-NEXT: movzbl 328(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 336(%rbp), %eax
+; AVX2-NEXT: movzbl 336(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 344(%rbp), %eax
+; AVX2-NEXT: movzbl 344(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT: movl 96(%rbp), %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: movl 104(%rbp), %eax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 112(%rbp), %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 120(%rbp), %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 128(%rbp), %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 136(%rbp), %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 144(%rbp), %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 152(%rbp), %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 160(%rbp), %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 168(%rbp), %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 176(%rbp), %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 184(%rbp), %eax
+; AVX2-NEXT: vmovd %edi, %xmm5
+; AVX2-NEXT: vpinsrb $1, %esi, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $2, %edx, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $3, %r9d, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $4, %r10d, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $5, %r11d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 16(%rbp), %ebx
+; AVX2-NEXT: vpinsrb $6, %ebx, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 24(%rbp), %r14d
+; AVX2-NEXT: vpinsrb $7, %r14d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 32(%rbp), %r15d
+; AVX2-NEXT: vpinsrb $8, %r15d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 40(%rbp), %r12d
+; AVX2-NEXT: vpinsrb $9, %r12d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 48(%rbp), %r13d
+; AVX2-NEXT: vpinsrb $10, %r13d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 56(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 192(%rbp), %eax
+; AVX2-NEXT: movzbl 64(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 200(%rbp), %eax
+; AVX2-NEXT: movzbl 72(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 208(%rbp), %eax
+; AVX2-NEXT: movzbl 80(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 216(%rbp), %eax
+; AVX2-NEXT: movzbl 88(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX2-NEXT: vmovd %edi, %xmm6
-; AVX2-NEXT: vpinsrb $1, %esi, %xmm6, %xmm6
-; AVX2-NEXT: vpinsrb $2, %edx, %xmm6, %xmm6
-; AVX2-NEXT: vpinsrb $3, %r13d, %xmm6, %xmm6
-; AVX2-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6
-; AVX2-NEXT: vpinsrb $5, %r9d, %xmm6, %xmm6
-; AVX2-NEXT: movl 16(%rbp), %esi
-; AVX2-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6
-; AVX2-NEXT: movl 24(%rbp), %edi
-; AVX2-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6
-; AVX2-NEXT: movl 32(%rbp), %r8d
-; AVX2-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6
-; AVX2-NEXT: movl 40(%rbp), %r9d
-; AVX2-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6
-; AVX2-NEXT: movl 48(%rbp), %r10d
-; AVX2-NEXT: vpinsrb $10, %r10d, %xmm6, %xmm6
-; AVX2-NEXT: movl 56(%rbp), %r11d
-; AVX2-NEXT: vpinsrb $11, %r11d, %xmm6, %xmm6
-; AVX2-NEXT: movl 64(%rbp), %r14d
-; AVX2-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6
-; AVX2-NEXT: movl 72(%rbp), %r12d
-; AVX2-NEXT: vpinsrb $13, %r12d, %xmm6, %xmm6
-; AVX2-NEXT: movl 80(%rbp), %eax
+; AVX2-NEXT: movzbl 96(%rbp), %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: movzbl 104(%rbp), %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 112(%rbp), %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 120(%rbp), %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 128(%rbp), %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 136(%rbp), %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 144(%rbp), %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 152(%rbp), %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 160(%rbp), %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 168(%rbp), %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 176(%rbp), %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 184(%rbp), %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 192(%rbp), %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 200(%rbp), %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 208(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movl 88(%rbp), %eax
+; AVX2-NEXT: movzbl 216(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm4
@@ -1980,379 +1962,434 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: vmovaps %ymm2, (%rsp)
; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: movzbl (%rsp,%rax), %eax
-; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movzbl (%rsp,%rax), %edx
; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: andl $1, %ebx
-; AVX2-NEXT: addq %rax, %rbx
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rbx)
-; AVX2-NEXT: andl $1, %r15d
-; AVX2-NEXT: addq %rbx, %r15
-; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r15)
-; AVX2-NEXT: andl $1, %r13d
-; AVX2-NEXT: addq %r15, %r13
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r13)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %r13, %rcx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: addq %rax, %rsi
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: addq %rsi, %rdi
-; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
-; AVX2-NEXT: andl $63, %esi
-; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi)
+; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: addq %rdi, %rsi
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rsi)
; AVX2-NEXT: andl $1, %r8d
-; AVX2-NEXT: addq %rdi, %r8
-; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
-; AVX2-NEXT: andl $63, %edi
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT: addq %rsi, %r8
+; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r8)
; AVX2-NEXT: andl $1, %r9d
; AVX2-NEXT: addq %r8, %r9
-; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8
-; AVX2-NEXT: andl $63, %r8d
-; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8)
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r9)
; AVX2-NEXT: andl $1, %r10d
; AVX2-NEXT: addq %r9, %r10
-; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9
-; AVX2-NEXT: andl $63, %r9d
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%r9)
+; AVX2-NEXT: movl %r10d, %eax
+; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
; AVX2-NEXT: andl $1, %r11d
; AVX2-NEXT: addq %r10, %r11
-; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
-; AVX2-NEXT: andl $63, %r10d
-; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10)
-; AVX2-NEXT: andl $1, %r14d
-; AVX2-NEXT: addq %r11, %r14
+; AVX2-NEXT: movzbl %bl, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %r11, %rax
; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11
; AVX2-NEXT: andl $63, %r11d
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%r11)
-; AVX2-NEXT: andl $1, %r12d
-; AVX2-NEXT: addq %r14, %r12
-; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14
-; AVX2-NEXT: andl $63, %r14d
-; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%r14)
-; AVX2-NEXT: movl 80(%rbp), %eax
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%r11)
+; AVX2-NEXT: movzbl %r14b, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl %r15b, %eax
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %r12, %rax
-; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12
-; AVX2-NEXT: andl $63, %r12d
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r12)
-; AVX2-NEXT: movl 88(%rbp), %ecx
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl %r12b, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 96(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl %r13b, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 56(%rbp), %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl 64(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 72(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl 80(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 88(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl 96(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 104(%rbp), %ecx
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 104(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 112(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 120(%rbp), %ecx
+; AVX2-NEXT: movzbl 112(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 120(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 128(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 136(%rbp), %ecx
+; AVX2-NEXT: movzbl 128(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 136(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 144(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 152(%rbp), %ecx
+; AVX2-NEXT: movzbl 144(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 152(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 160(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 168(%rbp), %ecx
+; AVX2-NEXT: movzbl 160(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 168(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 176(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 184(%rbp), %ecx
+; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl 176(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 184(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 192(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 200(%rbp), %ecx
+; AVX2-NEXT: movzbl 192(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 200(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 208(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 216(%rbp), %ecx
+; AVX2-NEXT: movzbl 208(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 216(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 224(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 232(%rbp), %ecx
+; AVX2-NEXT: movzbl 224(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 232(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 240(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 248(%rbp), %ecx
+; AVX2-NEXT: movzbl 240(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 248(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 256(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 264(%rbp), %ecx
+; AVX2-NEXT: movzbl 256(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 264(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 272(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 280(%rbp), %ecx
+; AVX2-NEXT: movzbl 272(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 280(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 288(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 296(%rbp), %ecx
+; AVX2-NEXT: movzbl 288(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 296(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 304(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 312(%rbp), %ecx
+; AVX2-NEXT: movzbl 304(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 312(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 320(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 328(%rbp), %ecx
+; AVX2-NEXT: movzbl 320(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 328(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 336(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 344(%rbp), %ecx
+; AVX2-NEXT: movzbl 336(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 344(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 352(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: movzbl 352(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 360(%rbp), %ecx
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 360(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 368(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 376(%rbp), %ecx
+; AVX2-NEXT: movzbl 368(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 376(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 384(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 392(%rbp), %ecx
+; AVX2-NEXT: movzbl 384(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 392(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 400(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 408(%rbp), %ecx
+; AVX2-NEXT: movzbl 400(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 408(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 416(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 424(%rbp), %ecx
+; AVX2-NEXT: movzbl 416(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 424(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 432(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 440(%rbp), %ecx
+; AVX2-NEXT: movzbl 432(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 440(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 448(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 456(%rbp), %ecx
+; AVX2-NEXT: movzbl 448(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 456(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 464(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 472(%rbp), %ecx
+; AVX2-NEXT: movzbl 464(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 472(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
; AVX2-NEXT: vpextrb $15, %xmm0, %eax
; AVX2-NEXT: cmpq $64, %rcx
-; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; AVX2-NEXT: cmovbl %edx, %eax
; AVX2-NEXT: cmpq $63, %rcx
-; AVX2-NEXT: movq %rcx, %rdx
-; AVX2-NEXT: movl $63, %ecx
-; AVX2-NEXT: cmovbq %rdx, %rcx
-; AVX2-NEXT: movb %al, (%rsp,%rcx)
+; AVX2-NEXT: movl $63, %edx
+; AVX2-NEXT: cmovbq %rcx, %rdx
+; AVX2-NEXT: movb %al, (%rsp,%rdx)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: leaq -40(%rbp), %rsp
@@ -3310,7 +3347,6 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX2-NEXT: addl %r8d, %r9d
; AVX2-NEXT: movzbl 16(%rbp), %ecx
; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%r9,4)
-; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %r9d, %ecx
; AVX2-NEXT: movzbl 24(%rbp), %edx
@@ -4461,9 +4497,8 @@ define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x
; AVX2-NEXT: cmovbq %r11, %rax
; AVX2-NEXT: movl %eax, %eax
; AVX2-NEXT: movq %rbx, (%rsp,%rax,8)
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
+; AVX2-NEXT: vmovaps (%rsp), %ymm0
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: leaq -8(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
@@ -4474,20 +4509,18 @@ define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
-; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_compress_knownbits_zext_v8i16_8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
-; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
%xvec = zext <8 x i16> %vec to <8 x i64> ; 0 -> 65535
%xpassthru = and <8 x i64> %passthru, splat (i64 3) ; 0 -> 3
@@ -4568,18 +4601,8 @@ define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i
; AVX2-NEXT: cmovbq %r11, %rax
; AVX2-NEXT: movl %eax, %eax
; AVX2-NEXT: movq %rbx, (%rsp,%rax,8)
-; AVX2-NEXT: vmovdqa (%rsp), %ymm0
-; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT: vpsllq $48, %ymm0, %ymm2
-; AVX2-NEXT: vpsrad $31, %ymm2, %ymm2
-; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX2-NEXT: vpsllq $48, %ymm1, %ymm2
-; AVX2-NEXT: vpsrad $31, %ymm2, %ymm2
-; AVX2-NEXT: vpslld $16, %ymm1, %ymm1
-; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vmovaps (%rsp), %ymm0
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: leaq -8(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
@@ -4590,22 +4613,18 @@ define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
-; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vpsllq $48, %zmm1, %zmm0
-; AVX512F-NEXT: vpsraq $48, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm1
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_compress_knownbits_sext_v8i16_8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
-; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm1
-; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT: vpsllq $48, %zmm1, %zmm0
-; AVX512VL-NEXT: vpsraq $48, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm1
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
%xvec = sext <8 x i16> %vec to <8 x i64> ; sign extend vec
%xpassthru = and <8 x i64> %passthru, splat(i64 3)